Completed
Push — master ( efe8af...05cd1c )
by Andrei
01:17
created

cure.__process_by_ccore()   A

Complexity

Conditions 1

Size

Total Lines 13

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
dl 0
loc 13
rs 9.4285
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: CURE
4
@details Implementation based on paper @cite article::cure::1.
5
6
@authors Andrei Novikov ([email protected])
7
@date 2014-2018
8
@copyright GNU Public License
9
10
@cond GNU_PUBLIC_LICENSE
11
    PyClustering is free software: you can redistribute it and/or modify
12
    it under the terms of the GNU General Public License as published by
13
    the Free Software Foundation, either version 3 of the License, or
14
    (at your option) any later version.
15
    
16
    PyClustering is distributed in the hope that it will be useful,
17
    but WITHOUT ANY WARRANTY; without even the implied warranty of
18
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19
    GNU General Public License for more details.
20
    
21
    You should have received a copy of the GNU General Public License
22
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
23
@endcond
24
25
"""
26
27
28
import numpy
0 ignored issues
show
Configuration introduced by
The import numpy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
29
30
from pyclustering.cluster.encoder import type_encoding
31
32
from pyclustering.utils import euclidean_distance
33
34
from pyclustering.container.kdtree import kdtree
35
36
from pyclustering.core.wrapper import ccore_library
37
38
import pyclustering.core.cure_wrapper as wrapper
39
40
41
class cure_cluster:
42
    """!
43
    @brief Represents data cluster in CURE term. 
44
    @details CURE cluster is described by points of cluster, representation points of the cluster and by the cluster center.
45
    
46
    """
47
    
48
    def __init__(self, point, index):
49
        """!
50
        @brief Constructor of CURE cluster.
51
        
52
        @param[in] point (list): Point represented by list of coordinates.
53
        @param[in] index (uint): Index point in dataset.
54
        
55
        """
56
        
57
        ## List of points that make up cluster.
58
        self.points = [ ]
59
        
60
        ## Point indexes in dataset.
61
        self.indexes = -1
62
        
63
        ## Mean of points that make up cluster.
64
        self.mean = None
65
        
66
        ## List of points that represents clusters.
67
        self.rep = [ ]
68
        
69
        if point is not None:
70
            self.points = [ point ]
71
            self.indexes = [ index ]
72
            self.mean = point
73
            self.rep = [ point ]
74
        
75
        ## Pointer to the closest cluster.
76
        self.closest = None
77
        
78
        ## Distance to the closest cluster.
79
        self.distance = float('inf')      # calculation of distance is really complexity operation (even square distance), so let's store distance to closest cluster.
80
81
    def __repr__(self):
82
        """!
83
        @brief Displays distance to closest cluster and points that are contained by current cluster.
84
        
85
        """
86
        return "%s, %s" % (self.distance, self.points)
87
        
88
89
class cure:
90
    """!
91
    @brief Class represents clustering algorithm CURE with KD-tree optimization.
92
    @details CCORE option can be used to use the pyclustering core - C/C++ shared library for processing that significantly increases performance.
93
    
94
    Example:
95
    @code
96
        # read data for clustering from some file
97
        sample = read_sample(path_to_data);
98
        
99
        # create instance of cure algorithm for cluster analysis
100
        # request for allocation of two clusters.
101
        cure_instance = cure(sample, 2, 5, 0.5, True);
102
        
103
        # run cluster analysis
104
        cure_instance.process();
105
        
106
        # get results of clustering
107
        clusters = cure_instance.get_clusters();
108
    @endcode
109
    
110
    """
111
    
112
    def __init__(self, data, number_cluster, number_represent_points = 5, compression = 0.5, ccore = True):
113
        """!
114
        @brief Constructor of clustering algorithm CURE.
115
        
116
        @param[in] data (array_like): Input data that should be processed.
117
        @param[in] number_cluster (uint): Number of clusters that should be allocated.
118
        @param[in] number_represent_points (uint): Number of representative points for each cluster.
119
        @param[in] compression (double): Coefficient defines level of shrinking of representation points toward the mean of the new created cluster after merging on each step. Usually it destributed from 0 to 1.
120
        @param[in] ccore (bool): If True than DLL CCORE (C++ solution) will be used for solving.
121
        
122
        """
123
        
124
        self.__pointer_data = self.__prepare_data_points(data)
125
        
126
        self.__clusters = None
127
        self.__representors = None
128
        self.__means = None
129
        
130
        self.__number_cluster = number_cluster
131
        self.__number_represent_points = number_represent_points
132
        self.__compression = compression
133
134
        self.__ccore = ccore
135
        if self.__ccore:
136
            self.__ccore = ccore_library.workable()
137
138
        self.__validate_arguments()
139
140
141
    def process(self):
142
        """!
143
        @brief Performs cluster analysis in line with rules of CURE algorithm.
144
        
145
        @remark Results of clustering can be obtained using corresponding get methods.
146
        
147
        @see get_clusters()
148
        
149
        """
150
        
151
        if self.__ccore is True:
152
            self.__process_by_ccore()
153
            
154
        else:
155
            self.__process_by_python()
156
157
158
    def __process_by_ccore(self):
159
        """!
160
        @brief Performs cluster analysis using CCORE (C/C++ part of pyclustering library).
161
162
        """
163
        cure_data_pointer = wrapper.cure_algorithm(self.__pointer_data, self.__number_cluster,
164
                                                   self.__number_represent_points, self.__compression)
165
166
        self.__clusters = wrapper.cure_get_clusters(cure_data_pointer)
167
        self.__representors = wrapper.cure_get_representors(cure_data_pointer)
168
        self.__means = wrapper.cure_get_means(cure_data_pointer)
169
170
        wrapper.cure_data_destroy(cure_data_pointer)
171
172
173
    def __process_by_python(self):
174
        """!
175
        @brief Performs cluster analysis using python code.
176
177
        """
178
        self.__create_queue()  # queue
179
        self.__create_kdtree()  # create k-d tree
180
181
        while len(self.__queue) > self.__number_cluster:
182
            cluster1 = self.__queue[0]  # cluster that has nearest neighbor.
183
            cluster2 = cluster1.closest  # closest cluster.
184
185
            self.__queue.remove(cluster1)
186
            self.__queue.remove(cluster2)
187
188
            self.__delete_represented_points(cluster1)
189
            self.__delete_represented_points(cluster2)
190
191
            merged_cluster = self.__merge_clusters(cluster1, cluster2)
192
193
            self.__insert_represented_points(merged_cluster)
194
195
            # Pointers to clusters that should be relocated is stored here.
196
            cluster_relocation_requests = []
197
198
            # Check for the last cluster
199
            if len(self.__queue) > 0:
200
                merged_cluster.closest = self.__queue[0]  # arbitrary cluster from queue
201
                merged_cluster.distance = self.__cluster_distance(merged_cluster, merged_cluster.closest)
202
203
                for item in self.__queue:
204
                    distance = self.__cluster_distance(merged_cluster, item)
205
                    # Check if distance between new cluster and current is the best than now.
206
                    if distance < merged_cluster.distance:
207
                        merged_cluster.closest = item
208
                        merged_cluster.distance = distance
209
210
                    # Check if current cluster has removed neighbor.
211
                    if (item.closest is cluster1) or (item.closest is cluster2):
212
                        # If previous distance was less then distance to new cluster then nearest cluster should be found in the tree.
213
                        # print("Update: ", item);
214
                        if item.distance < distance:
215
                            (item.closest, item.distance) = self.__closest_cluster(item, distance)
216
217
                            # TODO: investigation of root cause is required.
218
                            # Itself and merged cluster should be always in list of neighbors in line with specified radius.
219
                            # But merged cluster may not be in list due to error calculation, therefore it should be added
220
                            # manually.
221
                            if item.closest is None:
222
                                item.closest = merged_cluster
223
                                item.distance = distance
224
225
                        # Otherwise new cluster is nearest.
226
                        else:
227
                            item.closest = merged_cluster
228
                            item.distance = distance
229
230
                        cluster_relocation_requests.append(item)
231
                    elif item.distance > distance:
232
                        item.closest = merged_cluster
233
                        item.distance = distance
234
235
                        cluster_relocation_requests.append(item)
236
237
            # New cluster and updated clusters should relocated in queue
238
            self.__insert_cluster(merged_cluster)
239
            for item in cluster_relocation_requests:
240
                self.__relocate_cluster(item)
241
242
        # Change cluster representation
243
        self.__clusters = [cure_cluster_unit.indexes for cure_cluster_unit in self.__queue]
244
        self.__representors = [cure_cluster_unit.rep for cure_cluster_unit in self.__queue]
245
        self.__means = [cure_cluster_unit.mean for cure_cluster_unit in self.__queue]
246
247
248
    def get_clusters(self):
249
        """!
250
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
251
        
252
        @return (list) List of allocated clusters.
253
        
254
        @see process()
255
        @see get_representors()
256
        @see get_means()
257
        
258
        """
259
        
260
        return self.__clusters
261
    
262
    
263
    def get_representors(self):
264
        """!
265
        @brief Returns list of point-representors of each cluster.
266
        @details Cluster index should be used for navigation between lists of point-representors.
267
        
268
        @return (list) List of point-representors of each cluster.
269
        
270
        @see get_clusters()
271
        @see get_means()
272
        
273
        """
274
        
275
        return self.__representors
276
    
277
    
278
    def get_means(self):
279
        """!
280
        @brief Returns list of mean values of each cluster.
281
        @details Cluster index should be used for navigation between mean values.
282
        
283
        @return (list) List of mean values of each cluster.
284
        
285
        @see get_clusters()
286
        @see get_representors()
287
        
288
        """
289
        
290
        return self.__means
291
292
293
    def get_cluster_encoding(self):
294
        """!
295
        @brief Returns clustering result representation type that indicate how clusters are encoded.
296
        
297
        @return (type_encoding) Clustering result representation.
298
        
299
        @see get_clusters()
300
        
301
        """
302
        
303
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
304
305
306
    def __prepare_data_points(self, sample):
307
        """!
308
        @brief Prepare data points for clustering.
309
        @details In case of numpy.array there are a lot of overloaded basic operators, such as __contains__, __eq__.
310
311
        @return (list) Returns sample in list format.
312
313
        """
314
        if isinstance(sample, numpy.ndarray):
315
            return sample.tolist()
316
317
        return sample
318
319
320
    def __validate_arguments(self):
321
        """!
322
        @brief Check input arguments of BANG algorithm and if one of them is not correct then appropriate exception
323
                is thrown.
324
325
        """
326
327
        if len(self.__pointer_data) == 0:
328
            raise ValueError("Empty input data. Data should contain at least one point.")
329
330
        if self.__number_cluster <= 0:
331
            raise ValueError("Incorrect amount of clusters '%d'. Amount of cluster should be greater than 0." % self.__number_cluster)
332
333
        if self.__compression < 0:
334
            raise ValueError("Incorrect compression level '%f'. Compression should not be negative." % self.__compression)
335
336
        if self.__number_represent_points <= 0:
337
            raise ValueError("Incorrect amount of representatives '%d'. Amount of representatives should be greater than 0." % self.__number_cluster)
338
339
340
    def __insert_cluster(self, cluster):
341
        """!
342
        @brief Insert cluster to the list (sorted queue) in line with sequence order (distance).
343
        
344
        @param[in] cluster (cure_cluster): Cluster that should be inserted.
345
        
346
        """
347
        
348
        for index in range(len(self.__queue)):
349
            if cluster.distance < self.__queue[index].distance:
350
                self.__queue.insert(index, cluster)
351
                return
352
    
353
        self.__queue.append(cluster)
354
355
356
    def __relocate_cluster(self, cluster):
357
        """!
358
        @brief Relocate cluster in list in line with distance order.
359
        
360
        @param[in] cluster (cure_cluster): Cluster that should be relocated in line with order.
361
        
362
        """
363
        
364
        self.__queue.remove(cluster)
365
        self.__insert_cluster(cluster)
366
367
368
    def __closest_cluster(self, cluster, distance):
369
        """!
370
        @brief Find closest cluster to the specified cluster in line with distance.
371
        
372
        @param[in] cluster (cure_cluster): Cluster for which nearest cluster should be found.
373
        @param[in] distance (double): Closest distance to the previous cluster.
374
        
375
        @return (tuple) Pair (nearest CURE cluster, nearest distance) if the nearest cluster has been found, otherwise None is returned.
376
        
377
        """
378
        
379
        nearest_cluster = None
380
        nearest_distance = float('inf')
381
        
382
        for point in cluster.rep:
383
            # Nearest nodes should be returned (at least it will return itself).
384
            nearest_nodes = self.__tree.find_nearest_dist_nodes(point, distance)
385
            for (candidate_distance, kdtree_node) in nearest_nodes:
386
                if (candidate_distance < nearest_distance) and (kdtree_node is not None) and (kdtree_node.payload is not cluster):
387
                    nearest_distance = candidate_distance
388
                    nearest_cluster = kdtree_node.payload
389
                    
390
        return (nearest_cluster, nearest_distance)
391
392
393
    def __insert_represented_points(self, cluster):
394
        """!
395
        @brief Insert representation points to the k-d tree.
396
        
397
        @param[in] cluster (cure_cluster): Cluster whose representation points should be inserted.
398
        
399
        """
400
        
401
        for point in cluster.rep:
402
            self.__tree.insert(point, cluster)
403
404
405
    def __delete_represented_points(self, cluster): 
406
        """!
407
        @brief Remove representation points of clusters from the k-d tree
408
        
409
        @param[in] cluster (cure_cluster): Cluster whose representation points should be removed.
410
        
411
        """
412
        
413
        for point in cluster.rep:
414
            self.__tree.remove(point, payload=cluster)
415
416
417
    def __merge_clusters(self, cluster1, cluster2):
418
        """!
419
        @brief Merges two clusters and returns new merged cluster. Representation points and mean points are calculated for the new cluster.
420
        
421
        @param[in] cluster1 (cure_cluster): Cluster that should be merged.
422
        @param[in] cluster2 (cure_cluster): Cluster that should be merged.
423
        
424
        @return (cure_cluster) New merged CURE cluster.
425
        
426
        """
427
        
428
        merged_cluster = cure_cluster(None, None)
429
        
430
        merged_cluster.points = cluster1.points + cluster2.points
431
        merged_cluster.indexes = cluster1.indexes + cluster2.indexes
432
        
433
        # merged_cluster.mean = ( len(cluster1.points) * cluster1.mean + len(cluster2.points) * cluster2.mean ) / ( len(cluster1.points) + len(cluster2.points) );
434
        dimension = len(cluster1.mean)
435
        merged_cluster.mean = [0] * dimension
436
        if merged_cluster.points[1:] == merged_cluster.points[:-1]:
437
            merged_cluster.mean = merged_cluster.points[0]
438
        else:
439
            for index in range(dimension):
440
                merged_cluster.mean[index] = ( len(cluster1.points) * cluster1.mean[index] + len(cluster2.points) * cluster2.mean[index] ) / ( len(cluster1.points) + len(cluster2.points) );
441
        
442
        temporary = list()
443
        
444
        for index in range(self.__number_represent_points):
445
            maximal_distance = 0
446
            maximal_point = None
447
            
448
            for point in merged_cluster.points:
449
                minimal_distance = 0
450
                if index == 0:
451
                    minimal_distance = euclidean_distance(point, merged_cluster.mean)
452
                    #minimal_distance = euclidean_distance_sqrt(point, merged_cluster.mean);
453
                else:
454
                    minimal_distance = min([euclidean_distance(point, p) for p in temporary])
455
                    #minimal_distance = cluster_distance(cure_cluster(point), cure_cluster(temporary[0]));
456
                    
457
                if minimal_distance >= maximal_distance:
458
                    maximal_distance = minimal_distance
459
                    maximal_point = point
460
        
461
            if maximal_point not in temporary:
462
                temporary.append(maximal_point)
463
                
464
        for point in temporary:
465
            representative_point = [0] * dimension
466
            for index in range(dimension):
467
                representative_point[index] = point[index] + self.__compression * (merged_cluster.mean[index] - point[index])
468
                
469
            merged_cluster.rep.append(representative_point)
470
        
471
        return merged_cluster
472
473
474
    def __create_queue(self):
475
        """!
476
        @brief Create queue of sorted clusters by distance between them, where first cluster has the nearest neighbor. At the first iteration each cluster contains only one point.
477
        
478
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
479
        
480
        @return (list) Create queue of sorted clusters by distance between them.
481
        
482
        """
483
        
484
        self.__queue = [cure_cluster(self.__pointer_data[index_point], index_point) for index_point in range(len(self.__pointer_data))]
485
        
486
        # set closest clusters
487
        for i in range(0, len(self.__queue)):
488
            minimal_distance = float('inf')
489
            closest_index_cluster = -1
490
            
491
            for k in range(0, len(self.__queue)):
492
                if i != k:
493
                    dist = self.__cluster_distance(self.__queue[i], self.__queue[k])
494
                    if dist < minimal_distance:
495
                        minimal_distance = dist
496
                        closest_index_cluster = k
497
            
498
            self.__queue[i].closest = self.__queue[closest_index_cluster]
499
            self.__queue[i].distance = minimal_distance
500
        
501
        # sort clusters
502
        self.__queue.sort(key = lambda x: x.distance, reverse = False)
503
    
504
505
    def __create_kdtree(self):
506
        """!
507
        @brief Create k-d tree in line with created clusters. At the first iteration contains all points from the input data set.
508
        
509
        @return (kdtree) k-d tree that consist of representative points of CURE clusters.
510
        
511
        """
512
        
513
        self.__tree = kdtree()
514
        for current_cluster in self.__queue:
515
            for representative_point in current_cluster.rep:
516
                self.__tree.insert(representative_point, current_cluster)
517
518
519
    def __cluster_distance(self, cluster1, cluster2):
520
        """!
521
        @brief Calculate minimal distance between clusters using representative points.
522
        
523
        @param[in] cluster1 (cure_cluster): The first cluster.
524
        @param[in] cluster2 (cure_cluster): The second cluster.
525
        
526
        @return (double) Euclidean distance between two clusters that is defined by minimum distance between representation points of two clusters.
527
        
528
        """
529
        
530
        distance = float('inf')
531
        for i in range(0, len(cluster1.rep)):
532
            for k in range(0, len(cluster2.rep)):
533
                #dist = euclidean_distance_sqrt(cluster1.rep[i], cluster2.rep[k]);   # Fast mode
534
                dist = euclidean_distance(cluster1.rep[i], cluster2.rep[k])        # Slow mode
535
                if dist < distance:
536
                    distance = dist
537
                    
538
        return distance
539