Completed
Push — master ( aa7a06...786fcc )
by Andrei
01:40
created

cure_cluster.__init__()   B

Complexity

Conditions 2

Size

Total Lines 27

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 2
dl 0
loc 27
rs 8.8571
1
"""!
2
3
@brief Cluster analysis algorithm: CURE
4
@details Implementation based on article:
5
         - S.Guha, R.Rastogi, K.Shim. CURE: An Efficient Clustering Algorithm for Large Databases. 1998.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2016
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
from pyclustering.utils import euclidean_distance;
29
from pyclustering.utils import euclidean_distance_sqrt;
0 ignored issues
show
Unused Code introduced by
Unused euclidean_distance_sqrt imported from pyclustering.utils
Loading history...
30
31
from pyclustering.container.kdtree import kdtree;
32
33
import pyclustering.core.wrapper as wrapper;
34
35
class cure_cluster:   
36
    """!
37
    @brief Represents data cluster in CURE term. CURE cluster is described by points of cluster, representation points of the cluster and by the cluster center.
38
    
39
    """
40
    
41
    def __init__(self, point = None):
42
        """!
43
        @brief Constructor of CURE cluster.
44
        
45
        @param[in] point (list): Point represented by list of coordinates.
46
        
47
        """
48
        
49
        ## List of points that make up cluster.
50
        self.points = [ ];
51
        
52
        ## Mean of points that make up cluster.
53
        self.mean = None;
54
        
55
        ## List of points that represents clusters.
56
        self.rep = [ ];
57
        
58
        if (point is not None):
59
            self.points = [ point ];
60
            self.mean = point;
61
            self.rep = [ point ];
62
        
63
        ## Pointer to the closest cluster.
64
        self.closest = None;
65
        
66
        ## Distance to the closest cluster.
67
        self.distance = float('inf');      # calculation of distance is really complexity operation (even square distance), so let's store distance to closest cluster.
68
69
    def __repr__(self):
70
        """!
71
        @brief Displays distance to closest cluster and points that are contained by current cluster.
72
        
73
        """
74
        return "%s, %s" % (self.distance, self.points);
75
        
76
77
class cure:
78
    """!
79
    @brief Class represents clustering algorithm CURE.
80
    
81
    Example:
82
    @code
83
        # read data for clustering from some file
84
        sample = read_sample(path_to_data);
85
        
86
        # create instance of cure algorithm for cluster analysis
87
        # request for allocation of two clusters.
88
        cure_instance = cure(sample, 2, 5, 0.5, True);
89
        
90
        # run cluster analysis
91
        cure_instance.process();
92
        
93
        # get results of clustering
94
        clusters = cure_instance.get_clusters();
95
    @endcode
96
    
97
    """
98
    __pointer_data = None;
99
    __clusters = None;
100
    
101
    __number_cluster = 0;
102
    __number_represent_points = 0;
103
    __compression = 0;
104
    
105
    __queue = None;
106
    __tree = None;
107
    
108
    __ccore = False;
109
    
110
    def __init__(self, data, number_cluster, number_represent_points = 5, compression = 0.5, ccore = False):
111
        """!
112
        @brief Constructor of clustering algorithm CURE.
113
        
114
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
115
        @param[in] number_cluster (uint): Number of clusters that should be allocated.
116
        @param[in] number_represent_points (uint): Number of representative points for each cluster.
117
        @param[in] compression (double): Coefficient defines level of shrinking of representation points toward the mean of the new created cluster after merging on each step. Usually it destributed from 0 to 1.
118
        @param[in] ccore (bool): If True than DLL CCORE (C++ solution) will be used for solving.
119
        
120
        """
121
                
122
        self.__pointer_data = data;
123
        self.__number_cluster = number_cluster;
124
        self.__number_represent_points = number_represent_points;
125
        self.__compression = compression;
126
        
127
        self.__ccore = ccore;
128
        
129
        if (ccore is False):
130
            self.__create_queue();      # queue
131
            self.__create_kdtree();     # create k-d tree
132
133
    
134
    def process(self):
135
        """!
136
        @brief Performs cluster analysis in line with rules of CURE algorithm.
137
        
138
        @remark Results of clustering can be obtained using corresponding get methods.
139
        
140
        @see get_clusters()
141
        
142
        """
143
        
144
        if (self.__ccore is True):
145
            self.__clusters = wrapper.cure(self.__pointer_data, self.__number_cluster, self.__number_represent_points, self.__compression);
146
        else:
147
148
            while (len(self.__queue) > self.__number_cluster):
149
                cluster1 = self.__queue[0];            # cluster that has nearest neighbor.
150
                cluster2 = cluster1.closest;    # closest cluster.
151
                
152
                #print("Merge decision: \n\t", cluster1, "\n\t", cluster2);
153
                
154
                self.__queue.remove(cluster1);
155
                self.__queue.remove(cluster2);
156
                
157
                self.__delete_represented_points(cluster1);
158
                self.__delete_represented_points(cluster2);
159
        
160
                merged_cluster = self.__merge_clusters(cluster1, cluster2);
161
        
162
                self.__insert_represented_points(merged_cluster);
163
                
164
                # Pointers to clusters that should be relocated is stored here.
165
                cluster_relocation_requests = [];
166
                
167
                # Check for the last cluster
168
                if (len(self.__queue) > 0):
169
                    merged_cluster.closest = self.__queue[0];  # arbitrary cluster from queue
170
                    merged_cluster.distance = self.__cluster_distance(merged_cluster, merged_cluster.closest);
171
                    
172
                    for item in self.__queue:
173
                        distance = self.__cluster_distance(merged_cluster, item);
174
                        # Check if distance between new cluster and current is the best than now.
175
                        if (distance < merged_cluster.distance):
176
                            merged_cluster.closest = item;
177
                            merged_cluster.distance = distance;
178
                        
179
                        # Check if current cluster has removed neighbor.
180
                        if ( (item.closest is cluster1) or (item.closest is cluster2) ):
181
                            # If previous distance was less then distance to new cluster then nearest cluster should be found in the tree.
182
                            #print("Update: ", item);
183
                            if (item.distance < distance):
184
                                (item.closest, item.distance) = self.__closest_cluster(item, distance);
185
                                
186
                                # TODO: investigation of root cause is required.
187
                                # Itself and merged cluster should be always in list of neighbors in line with specified radius.
188
                                # But merged cluster may not be in list due to error calculation, therefore it should be added
189
                                # manually.
190
                                if (item.closest is None):
191
                                    item.closest = merged_cluster;
192
                                    item.distance = distance;
193
                                
194
                            # Otherwise new cluster is nearest.
195
                            else:
196
                                item.closest = merged_cluster;
197
                                item.distance = distance;
198
                            
199
                            cluster_relocation_requests.append(item);
200
                        elif (item.distance > distance):
201
                            item.closest = merged_cluster;
202
                            item.ditance = distance;
203
                            
204
                            cluster_relocation_requests.append(item);
205
                
206
                # New cluster and updated clusters should relocated in queue
207
                self.__insert_cluster(merged_cluster);
208
                [self.__relocate_cluster(item) for item in cluster_relocation_requests];
0 ignored issues
show
Unused Code Bug introduced by
The expression [self.__relocate_cluster...er_relocation_requests] does not seem to have sideeffects and its result is not used.

If a expression has no sideeffects (any lasting effect after it has been called) and its return value is not used, this usually means that this code can be removed or that an assignment is missing.

Loading history...
209
        
210
            # Change cluster representation
211
            self.__clusters = [ cure_cluster_unit.points for cure_cluster_unit in self.__queue ];
212
    
213
    
214
    def get_clusters(self):
215
        """!
216
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
217
        
218
        @return (list) List of allocated clusters.
219
        
220
        @see process()
221
        
222
        """
223
        
224
        return self.__clusters;  
225
    
226
    
227
    def __insert_cluster(self, cluster):
228
        """!
229
        @brief Insert cluster to the list (sorted queue) in line with sequence order (distance).
230
        
231
        @param[in] cluster (cure_cluster): Cluster that should be inserted.
232
        
233
        """
234
        
235
        for index in range(len(self.__queue)):
236
            if (cluster.distance < self.__queue[index].distance):
237
                self.__queue.insert(index, cluster);
238
                return;
239
    
240
        self.__queue.append(cluster);        
241
242
243
    def __relocate_cluster(self, cluster):
244
        """!
245
        @brief Relocate cluster in list in line with distance order.
246
        
247
        @param[in] cluster (cure_cluster): Cluster that should be relocated in line with order.
248
        
249
        """
250
        
251
        self.__queue.remove(cluster);
252
        self.__insert_cluster(cluster);
253
254
255
    def __closest_cluster(self, cluster, distance):
256
        """!
257
        @brief Find closest cluster to the specified cluster in line with distance.
258
        
259
        @param[in] cluster (cure_cluster): Cluster for which nearest cluster should be found.
260
        @param[in] distance (double): Closest distance to the previous cluster.
261
        
262
        @return (tuple) Pair (nearest CURE cluster, nearest distance) if the nearest cluster has been found, otherwise None is returned.
263
        
264
        """
265
        
266
        nearest_cluster = None;
267
        nearest_distance = float('inf');
268
        
269
        for point in cluster.rep:
270
            # Nearest nodes should be returned (at least it will return itself).
271
            nearest_nodes = self.__tree.find_nearest_dist_nodes(point, distance);
272
            for (candidate_distance, kdtree_node) in nearest_nodes:
273
                if ( (candidate_distance < nearest_distance) and (kdtree_node is not None) and (kdtree_node.payload is not cluster) ):
274
                    nearest_distance = candidate_distance;
275
                    nearest_cluster = kdtree_node.payload;
276
                    
277
        return (nearest_cluster, nearest_distance);
278
279
280
    def __insert_represented_points(self, cluster):
281
        """!
282
        @brief Insert representation points to the k-d tree.
283
        
284
        @param[in] cluster (cure_cluster): Cluster whose representation points should be inserted.
285
        
286
        """
287
        
288
        for point in cluster.rep:
289
            self.__tree.insert(point, cluster);
290
291
292
    def __delete_represented_points(self, cluster): 
293
        """!
294
        @brief Remove representation points of clusters from the k-d tree
295
        
296
        @param[in] cluster (cure_cluster): Cluster whose representation points should be removed.
297
        
298
        """
299
        
300
        for point in cluster.rep:
301
            self.__tree.remove(point);
302
303
304
    def __merge_clusters(self, cluster1, cluster2):
305
        """!
306
        @brief Merges two clusters and returns new merged cluster. Representation points and mean points are calculated for the new cluster.
307
        
308
        @param[in] cluster1 (cure_cluster): Cluster that should be merged.
309
        @param[in] cluster2 (cure_cluster): Cluster that should be merged.
310
        
311
        @return (cure_cluster) New merged CURE cluster.
312
        
313
        """
314
        
315
        merged_cluster = cure_cluster();
316
        
317
        merged_cluster.points = cluster1.points + cluster2.points;
318
        
319
        # merged_cluster.mean = ( len(cluster1.points) * cluster1.mean + len(cluster2.points) * cluster2.mean ) / ( len(cluster1.points) + len(cluster2.points) );
320
        dimension = len(cluster1.mean);
321
        merged_cluster.mean = [0] * dimension;
322
        if merged_cluster.points[1:] == merged_cluster.points[:-1]:
323
            merged_cluster.mean = merged_cluster.points[0]
324
        else:
325
            for index in range(dimension):
326
                merged_cluster.mean[index] = ( len(cluster1.points) * cluster1.mean[index] + len(cluster2.points) * cluster2.mean[index] ) / ( len(cluster1.points) + len(cluster2.points) );
327
        
328
        temporary = list(); # TODO: Set should be used in line with specification (article), but list is not hashable object therefore it's impossible to use list in this fucking set!
329
        
330
        for index in range(self.__number_represent_points):
331
            maximal_distance = 0;
332
            maximal_point = None;
333
            
334
            for point in merged_cluster.points:
335
                minimal_distance = 0;
336
                if (index == 0):
337
                    minimal_distance = euclidean_distance(point, merged_cluster.mean);
338
                    #minimal_distance = euclidean_distance_sqrt(point, merged_cluster.mean);
339
                else:
340
                    minimal_distance = euclidean_distance(point, temporary[0]);
341
                    #minimal_distance = cluster_distance(cure_cluster(point), cure_cluster(temporary[0]));
342
                    
343
                if (minimal_distance >= maximal_distance):
344
                    maximal_distance = minimal_distance;
345
                    maximal_point = point;
346
        
347
            if (maximal_point not in temporary):
348
                temporary.append(maximal_point);
349
                
350
        for point in temporary:
351
            representative_point = [0] * dimension;
352
            for index in range(dimension):
353
                representative_point[index] = point[index] + self.__compression * (merged_cluster.mean[index] - point[index]);
354
                
355
            merged_cluster.rep.append(representative_point);
356
        
357
        return merged_cluster;
358
359
360
    def __create_queue(self):
361
        """!
362
        @brief Create queue of sorted clusters by distance between them, where first cluster has the nearest neighbor. At the first iteration each cluster contains only one point.
363
        
364
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
365
        
366
        @return (list) Create queue of sorted clusters by distance between them.
367
        
368
        """
369
        
370
        self.__queue = [cure_cluster(point) for point in self.__pointer_data];
371
        
372
        # set closest clusters
373
        for i in range(0, len(self.__queue)):
374
            minimal_distance = float('inf');
375
            closest_index_cluster = -1;
376
            
377
            for k in range(0, len(self.__queue)):
378
                if (i != k):
379
                    dist = self.__cluster_distance(self.__queue[i], self.__queue[k]);
380
                    if (dist < minimal_distance):
381
                        minimal_distance = dist;
382
                        closest_index_cluster = k;
383
            
384
            self.__queue[i].closest = self.__queue[closest_index_cluster];
385
            self.__queue[i].distance = minimal_distance;
386
        
387
        # sort clusters
388
        self.__queue.sort(key = lambda x: x.distance, reverse = False);
389
    
390
391
    def __create_kdtree(self):
392
        """!
393
        @brief Create k-d tree in line with created clusters. At the first iteration contains all points from the input data set.
394
        
395
        @return (kdtree) k-d tree that consist of representative points of CURE clusters.
396
        
397
        """
398
        
399
        self.__tree = kdtree();
400
        for current_cluster in self.__queue:
401
            for representative_point in current_cluster.rep:
402
                self.__tree.insert(representative_point, current_cluster);    
403
404
405
    def __cluster_distance(self, cluster1, cluster2):
406
        """!
407
        @brief Calculate minimal distance between clusters using representative points.
408
        
409
        @param[in] cluster1 (cure_cluster): The first cluster.
410
        @param[in] cluster2 (cure_cluster): The second cluster.
411
        
412
        @return (double) Euclidean distance between two clusters that is defined by minimum distance between representation points of two clusters.
413
        
414
        """
415
        
416
        distance = float('inf');
417
        for i in range(0, len(cluster1.rep)):
418
            for k in range(0, len(cluster2.rep)):
419
                #dist = euclidean_distance_sqrt(cluster1.rep[i], cluster2.rep[k]);   # Fast mode
420
                dist = euclidean_distance(cluster1.rep[i], cluster2.rep[k]);        # Slow mode
421
                if (dist < distance):
422
                    distance = dist;
423
                    
424
        return distance;
425