Completed
Push — master ( 89ef1c...a90484 )
by Andrei
01:50
created

cure.get_means()   A

Complexity

Conditions 1

Size

Total Lines 13

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
dl 0
loc 13
rs 9.4285
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: CURE
4
@details Implementation based on article:
5
         - S.Guha, R.Rastogi, K.Shim. CURE: An Efficient Clustering Algorithm for Large Databases. 1998.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2016
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
from pyclustering.utils import euclidean_distance;
29
30
from pyclustering.container.kdtree import kdtree;
31
32
import pyclustering.core.cure_wrapper as wrapper;
33
34
class cure_cluster:
35
    """!
36
    @brief Represents data cluster in CURE term. CURE cluster is described by points of cluster, representation points of the cluster and by the cluster center.
37
    
38
    """
39
    
40
    def __init__(self, point = None):
41
        """!
42
        @brief Constructor of CURE cluster.
43
        
44
        @param[in] point (list): Point represented by list of coordinates.
45
        
46
        """
47
        
48
        ## List of points that make up cluster.
49
        self.points = [ ];
50
        
51
        ## Mean of points that make up cluster.
52
        self.mean = None;
53
        
54
        ## List of points that represents clusters.
55
        self.rep = [ ];
56
        
57
        if (point is not None):
58
            self.points = [ point ];
59
            self.mean = point;
60
            self.rep = [ point ];
61
        
62
        ## Pointer to the closest cluster.
63
        self.closest = None;
64
        
65
        ## Distance to the closest cluster.
66
        self.distance = float('inf');      # calculation of distance is really complexity operation (even square distance), so let's store distance to closest cluster.
67
68
    def __repr__(self):
69
        """!
70
        @brief Displays distance to closest cluster and points that are contained by current cluster.
71
        
72
        """
73
        return "%s, %s" % (self.distance, self.points);
74
        
75
76
class cure:
77
    """!
78
    @brief Class represents clustering algorithm CURE.
79
    
80
    Example:
81
    @code
82
        # read data for clustering from some file
83
        sample = read_sample(path_to_data);
84
        
85
        # create instance of cure algorithm for cluster analysis
86
        # request for allocation of two clusters.
87
        cure_instance = cure(sample, 2, 5, 0.5, True);
88
        
89
        # run cluster analysis
90
        cure_instance.process();
91
        
92
        # get results of clustering
93
        clusters = cure_instance.get_clusters();
94
    @endcode
95
    
96
    """
97
    
98
    def __init__(self, data, number_cluster, number_represent_points = 5, compression = 0.5, ccore = False):
99
        """!
100
        @brief Constructor of clustering algorithm CURE.
101
        
102
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
103
        @param[in] number_cluster (uint): Number of clusters that should be allocated.
104
        @param[in] number_represent_points (uint): Number of representative points for each cluster.
105
        @param[in] compression (double): Coefficient defines level of shrinking of representation points toward the mean of the new created cluster after merging on each step. Usually it destributed from 0 to 1.
106
        @param[in] ccore (bool): If True than DLL CCORE (C++ solution) will be used for solving.
107
        
108
        """
109
        
110
        self.__pointer_data = data;
111
        
112
        self.__clusters = None;
113
        self.__representors = None;
114
        self.__means = None;
115
        
116
        self.__number_cluster = number_cluster;
117
        self.__number_represent_points = number_represent_points;
118
        self.__compression = compression;
119
        
120
        self.__ccore = ccore;
121
        
122
        if (ccore is False):
123
            self.__create_queue();      # queue
124
            self.__create_kdtree();     # create k-d tree
125
126
    
127
    def process(self):
128
        """!
129
        @brief Performs cluster analysis in line with rules of CURE algorithm.
130
        
131
        @remark Results of clustering can be obtained using corresponding get methods.
132
        
133
        @see get_clusters()
134
        
135
        """
136
        
137
        if (self.__ccore is True):
138
            cure_data_pointer = wrapper.cure_algorithm(self.__pointer_data, self.__number_cluster, self.__number_represent_points, self.__compression);
139
            
140
            self.__clusters = wrapper.cure_get_clusters(cure_data_pointer);
141
            self.__representors = wrapper.cure_get_representors(cure_data_pointer);
142
            self.__means = wrapper.cure_get_means(cure_data_pointer);
143
            
144
            wrapper.cure_data_destroy(cure_data_pointer);
145
            
146
        else:
147
            while (len(self.__queue) > self.__number_cluster):
148
                cluster1 = self.__queue[0];            # cluster that has nearest neighbor.
149
                cluster2 = cluster1.closest;    # closest cluster.
150
                
151
                #print("Merge decision: \n\t", cluster1, "\n\t", cluster2);
152
                
153
                self.__queue.remove(cluster1);
154
                self.__queue.remove(cluster2);
155
                
156
                self.__delete_represented_points(cluster1);
157
                self.__delete_represented_points(cluster2);
158
        
159
                merged_cluster = self.__merge_clusters(cluster1, cluster2);
160
        
161
                self.__insert_represented_points(merged_cluster);
162
                
163
                # Pointers to clusters that should be relocated is stored here.
164
                cluster_relocation_requests = [];
165
                
166
                # Check for the last cluster
167
                if (len(self.__queue) > 0):
168
                    merged_cluster.closest = self.__queue[0];  # arbitrary cluster from queue
169
                    merged_cluster.distance = self.__cluster_distance(merged_cluster, merged_cluster.closest);
170
                    
171
                    for item in self.__queue:
172
                        distance = self.__cluster_distance(merged_cluster, item);
173
                        # Check if distance between new cluster and current is the best than now.
174
                        if (distance < merged_cluster.distance):
175
                            merged_cluster.closest = item;
176
                            merged_cluster.distance = distance;
177
                        
178
                        # Check if current cluster has removed neighbor.
179
                        if ( (item.closest is cluster1) or (item.closest is cluster2) ):
180
                            # If previous distance was less then distance to new cluster then nearest cluster should be found in the tree.
181
                            #print("Update: ", item);
182
                            if (item.distance < distance):
183
                                (item.closest, item.distance) = self.__closest_cluster(item, distance);
184
                                
185
                                # TODO: investigation of root cause is required.
186
                                # Itself and merged cluster should be always in list of neighbors in line with specified radius.
187
                                # But merged cluster may not be in list due to error calculation, therefore it should be added
188
                                # manually.
189
                                if (item.closest is None):
190
                                    item.closest = merged_cluster;
191
                                    item.distance = distance;
192
                                
193
                            # Otherwise new cluster is nearest.
194
                            else:
195
                                item.closest = merged_cluster;
196
                                item.distance = distance;
197
                            
198
                            cluster_relocation_requests.append(item);
199
                        elif (item.distance > distance):
200
                            item.closest = merged_cluster;
201
                            item.distance = distance;
202
                            
203
                            cluster_relocation_requests.append(item);
204
                
205
                # New cluster and updated clusters should relocated in queue
206
                self.__insert_cluster(merged_cluster);
207
                for item in cluster_relocation_requests:
208
                    self.__relocate_cluster(item);
209
        
210
            # Change cluster representation
211
            self.__clusters = [ cure_cluster_unit.points for cure_cluster_unit in self.__queue ];
212
            self.__representors = [ cure_cluster_unit.rep for cure_cluster_unit in self.__queue ];
213
            self.__means = [ cure_cluster_unit.mean for cure_cluster_unit in self.__queue ];
214
    
215
    
216
    def get_clusters(self):
217
        """!
218
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
219
        
220
        @return (list) List of allocated clusters.
221
        
222
        @see process()
223
        @see get_representors()
224
        @see get_means()
225
        
226
        """
227
        
228
        return self.__clusters;  
229
    
230
    
231
    def get_representors(self):
232
        """!
233
        @brief Returns list of representors of each cluster.
234
        @details Cluster index should be used for navigation between lists of representors.
235
        
236
        @return (list) List of representors of each cluster.
237
        
238
        @see get_clusters()
239
        @see get_means()
240
        
241
        """
242
        
243
        return self.__representors;
244
    
245
    
246
    def get_means(self):
247
        """!
248
        @brief Returns list of mean values of each cluster.
249
        @details Cluster index should be used for navigation between mean values.
250
        
251
        @return (list) List of mean values of each cluster.
252
        
253
        @see get_clusters()
254
        @see get_representors()
255
        
256
        """
257
        
258
        return self.__means;
259
    
260
    
261
    def __insert_cluster(self, cluster):
262
        """!
263
        @brief Insert cluster to the list (sorted queue) in line with sequence order (distance).
264
        
265
        @param[in] cluster (cure_cluster): Cluster that should be inserted.
266
        
267
        """
268
        
269
        for index in range(len(self.__queue)):
270
            if (cluster.distance < self.__queue[index].distance):
271
                self.__queue.insert(index, cluster);
272
                return;
273
    
274
        self.__queue.append(cluster);        
275
276
277
    def __relocate_cluster(self, cluster):
278
        """!
279
        @brief Relocate cluster in list in line with distance order.
280
        
281
        @param[in] cluster (cure_cluster): Cluster that should be relocated in line with order.
282
        
283
        """
284
        
285
        self.__queue.remove(cluster);
286
        self.__insert_cluster(cluster);
287
288
289
    def __closest_cluster(self, cluster, distance):
290
        """!
291
        @brief Find closest cluster to the specified cluster in line with distance.
292
        
293
        @param[in] cluster (cure_cluster): Cluster for which nearest cluster should be found.
294
        @param[in] distance (double): Closest distance to the previous cluster.
295
        
296
        @return (tuple) Pair (nearest CURE cluster, nearest distance) if the nearest cluster has been found, otherwise None is returned.
297
        
298
        """
299
        
300
        nearest_cluster = None;
301
        nearest_distance = float('inf');
302
        
303
        for point in cluster.rep:
304
            # Nearest nodes should be returned (at least it will return itself).
305
            nearest_nodes = self.__tree.find_nearest_dist_nodes(point, distance);
306
            for (candidate_distance, kdtree_node) in nearest_nodes:
307
                if ( (candidate_distance < nearest_distance) and (kdtree_node is not None) and (kdtree_node.payload is not cluster) ):
308
                    nearest_distance = candidate_distance;
309
                    nearest_cluster = kdtree_node.payload;
310
                    
311
        return (nearest_cluster, nearest_distance);
312
313
314
    def __insert_represented_points(self, cluster):
315
        """!
316
        @brief Insert representation points to the k-d tree.
317
        
318
        @param[in] cluster (cure_cluster): Cluster whose representation points should be inserted.
319
        
320
        """
321
        
322
        for point in cluster.rep:
323
            self.__tree.insert(point, cluster);
324
325
326
    def __delete_represented_points(self, cluster): 
327
        """!
328
        @brief Remove representation points of clusters from the k-d tree
329
        
330
        @param[in] cluster (cure_cluster): Cluster whose representation points should be removed.
331
        
332
        """
333
        
334
        for point in cluster.rep:
335
            self.__tree.remove(point);
336
337
338
    def __merge_clusters(self, cluster1, cluster2):
339
        """!
340
        @brief Merges two clusters and returns new merged cluster. Representation points and mean points are calculated for the new cluster.
341
        
342
        @param[in] cluster1 (cure_cluster): Cluster that should be merged.
343
        @param[in] cluster2 (cure_cluster): Cluster that should be merged.
344
        
345
        @return (cure_cluster) New merged CURE cluster.
346
        
347
        """
348
        
349
        merged_cluster = cure_cluster();
350
        
351
        merged_cluster.points = cluster1.points + cluster2.points;
352
        
353
        # merged_cluster.mean = ( len(cluster1.points) * cluster1.mean + len(cluster2.points) * cluster2.mean ) / ( len(cluster1.points) + len(cluster2.points) );
354
        dimension = len(cluster1.mean);
355
        merged_cluster.mean = [0] * dimension;
356
        if merged_cluster.points[1:] == merged_cluster.points[:-1]:
357
            merged_cluster.mean = merged_cluster.points[0]
358
        else:
359
            for index in range(dimension):
360
                merged_cluster.mean[index] = ( len(cluster1.points) * cluster1.mean[index] + len(cluster2.points) * cluster2.mean[index] ) / ( len(cluster1.points) + len(cluster2.points) );
361
        
362
        temporary = list();
363
        
364
        for index in range(self.__number_represent_points):
365
            maximal_distance = 0;
366
            maximal_point = None;
367
            
368
            for point in merged_cluster.points:
369
                minimal_distance = 0;
370
                if (index == 0):
371
                    minimal_distance = euclidean_distance(point, merged_cluster.mean);
372
                    #minimal_distance = euclidean_distance_sqrt(point, merged_cluster.mean);
373
                else:
374
                    minimal_distance = min([euclidean_distance(point, p) for p in temporary]);
375
                    #minimal_distance = cluster_distance(cure_cluster(point), cure_cluster(temporary[0]));
376
                    
377
                if (minimal_distance >= maximal_distance):
378
                    maximal_distance = minimal_distance;
379
                    maximal_point = point;
380
        
381
            if (maximal_point not in temporary):
382
                temporary.append(maximal_point);
383
                
384
        for point in temporary:
385
            representative_point = [0] * dimension;
386
            for index in range(dimension):
387
                representative_point[index] = point[index] + self.__compression * (merged_cluster.mean[index] - point[index]);
388
                
389
            merged_cluster.rep.append(representative_point);
390
        
391
        return merged_cluster;
392
393
394
    def __create_queue(self):
395
        """!
396
        @brief Create queue of sorted clusters by distance between them, where first cluster has the nearest neighbor. At the first iteration each cluster contains only one point.
397
        
398
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
399
        
400
        @return (list) Create queue of sorted clusters by distance between them.
401
        
402
        """
403
        
404
        self.__queue = [cure_cluster(point) for point in self.__pointer_data];
405
        
406
        # set closest clusters
407
        for i in range(0, len(self.__queue)):
408
            minimal_distance = float('inf');
409
            closest_index_cluster = -1;
410
            
411
            for k in range(0, len(self.__queue)):
412
                if (i != k):
413
                    dist = self.__cluster_distance(self.__queue[i], self.__queue[k]);
414
                    if (dist < minimal_distance):
415
                        minimal_distance = dist;
416
                        closest_index_cluster = k;
417
            
418
            self.__queue[i].closest = self.__queue[closest_index_cluster];
419
            self.__queue[i].distance = minimal_distance;
420
        
421
        # sort clusters
422
        self.__queue.sort(key = lambda x: x.distance, reverse = False);
423
    
424
425
    def __create_kdtree(self):
426
        """!
427
        @brief Create k-d tree in line with created clusters. At the first iteration contains all points from the input data set.
428
        
429
        @return (kdtree) k-d tree that consist of representative points of CURE clusters.
430
        
431
        """
432
        
433
        self.__tree = kdtree();
434
        for current_cluster in self.__queue:
435
            for representative_point in current_cluster.rep:
436
                self.__tree.insert(representative_point, current_cluster);    
437
438
439
    def __cluster_distance(self, cluster1, cluster2):
440
        """!
441
        @brief Calculate minimal distance between clusters using representative points.
442
        
443
        @param[in] cluster1 (cure_cluster): The first cluster.
444
        @param[in] cluster2 (cure_cluster): The second cluster.
445
        
446
        @return (double) Euclidean distance between two clusters that is defined by minimum distance between representation points of two clusters.
447
        
448
        """
449
        
450
        distance = float('inf');
451
        for i in range(0, len(cluster1.rep)):
452
            for k in range(0, len(cluster2.rep)):
453
                #dist = euclidean_distance_sqrt(cluster1.rep[i], cluster2.rep[k]);   # Fast mode
454
                dist = euclidean_distance(cluster1.rep[i], cluster2.rep[k]);        # Slow mode
455
                if (dist < distance):
456
                    distance = dist;
457
                    
458
        return distance;
459