Completed
Push — master ( bfa7eb...b05035 )
by Andrei
01:44
created

ordering_visualizer.show_ordering_diagram()   B

Complexity

Conditions 3

Size

Total Lines 31

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
dl 0
loc 31
rs 8.8571
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: OPTICS (Ordering Points To Identify Clustering Structure)
4
@details Based on article description:
5
         - M.Ankerst, M.Breunig, H.Kriegel, J.Sander. OPTICS: Ordering Points To Identify the Clustering Structure. 1999.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2017
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
from pyclustering.cluster.encoder import type_encoding;
30
31
from pyclustering.utils import euclidean_distance;
32
33
import matplotlib.pyplot as plt;
0 ignored issues
show
Configuration introduced by
The import matplotlib.pyplot could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
34
35
import pyclustering.core.optics_wrapper as wrapper;
36
37
38
class ordering_visualizer:
39
    """!
40
    @brief Cluster ordering diagram visualizer that represents dataset graphically as density-based clustering structure.
41
    
42
    @see ordering_analyser
43
    
44
    """
45
    
46
    @staticmethod
47
    def show_ordering_diagram(analyser, amount_clusters = None):
48
        """!
49
        @brief Display cluster-ordering diagram.
50
        
51
        @param[in] analyser (ordering_analyser): cluster-ordering analyser whose ordering diagram should be displayed.
52
        @param[in] amount_clusters (uint): if it is not 'None' then it displays connectivity radius line that can used for allocation of specified amount of clusters.
53
        
54
        Example demonstrates general abilities of 'ordering_visualizer' class:
55
        @code
56
        # Display cluster-ordering diagram with connectivity radius is used for allocation of three clusters.
57
        ordering_visualizer.show_ordering_diagram(analyser, 3);
58
        
59
        # Display cluster-ordering diagram without radius.
60
        ordering_visualizer.show_ordering_diagram(analyser);
61
        @endcode
62
        
63
        """
64
        ordering = analyser.cluster_ordering;
65
        indexes = [i for i in range(0, len(ordering))];
66
        
67
        axis = plt.subplot(111);
68
        axis.bar(indexes, ordering, color = 'black');
69
        plt.xlim([0, len(ordering)]);
70
        
71
        if (amount_clusters is not None):
72
            radius = analyser.calculate_connvectivity_radius(amount_clusters);
73
            plt.axhline(y = analyser.calculate_connvectivity_radius(amount_clusters), linewidth = 2, color = 'b');
74
            plt.text(0, radius + radius * 0.03, " Radius:   " + str(round(radius, 4)) + ";\n Clusters: " + str(amount_clusters), color = 'b', fontsize = 10);
75
        
76
        plt.show();
77
78
79
class ordering_analyser:
80
    """!
81
    @brief Analyser of cluster ordering diagram.
82
    @details Using cluster-ordering it is able to connectivity radius for allocation of specified amount of clusters and
83
              calculate amount of clusters using specified connectivity radius. Cluster-ordering is formed by OPTICS algorithm
84
              during cluster analysis.
85
    
86
    @see optics
87
    
88
    """
89
    
90
    @property
91
    def cluster_ordering(self):
92
        """!
93
        @brief (list) Returns values of dataset cluster ordering.
94
        
95
        """
96
        return self.__ordering;
97
    
98
    
99
    def __init__(self, ordering_diagram):
100
        """!
101
        @brief Analyser of ordering diagram that is based on reachability-distances.
102
        
103
        @see calculate_connvectivity_radius
104
        
105
        """
106
        self.__ordering = ordering_diagram;
107
    
108
    
109
    def __len__(self):
110
        """!
111
        @brief Returns length of clustering-ordering diagram.
112
        
113
        """
114
        return len(self.__ordering);
115
    
116
    
117
    def calculate_connvectivity_radius(self, amount_clusters):
118
        """!
119
        @brief Calculates connectivity radius of allocation specified amount of clusters using ordering diagram.
120
        
121
        @param[in] amount_clusters (uint): amount of clusters that should be allocated by calculated connectivity radius.
122
        
123
        @return (double) Value of connectivity radius.
124
        
125
        """
126
        
127
        maximum_distance = max(self.__ordering);
128
        
129
        upper_distance = maximum_distance;
130
        lower_distance = 0.0;
131
        
132
        radius = None;
133
        
134
        if (self.extract_cluster_amount(maximum_distance) <= amount_clusters):
135
            while(True):
136
                radius = (lower_distance + upper_distance) / 2.0;
137
                
138
                amount = self.extract_cluster_amount(radius);
139
                if (amount == amount_clusters):
140
                    break;
141
                
142
                elif (amount > amount_clusters):
143
                    lower_distance = radius;
144
                
145
                elif (amount < amount_clusters):
146
                    upper_distance = radius;
147
        
148
        return radius;
149
    
150
    
151
    def extract_cluster_amount(self, radius):
152
        """!
153
        @brief Obtains amount of clustering that can be allocated by using specified radius for ordering diagram.
154
        @details When growth of reachability-distances is detected than it is considered as a start point of cluster, 
155
                 than pick is detected and after that recession is observed until new growth (that means end of the
156
                 current cluster and start of a new one) or end of diagram.
157
        
158
        @param[in] radius (double): connectivity radius that is used for cluster allocation.
159
        
160
        
161
        @return (unit) Amount of clusters that can be allocated by the connectivity radius on ordering diagram.
162
        
163
        """
164
        
165
        amount_clusters = 1;
166
        
167
        cluster_start = False;
168
        cluster_pick = False;
169
        previous_distance = 0.0;
170
        
171
        for distance in self.__ordering:
172
            if (distance >= radius):
173
                if (cluster_start is False):
174
                    cluster_start = True;
175
                    amount_clusters += 1;
176
                
177
                else:
178
                    if ((distance < previous_distance) and (cluster_pick is False)):
179
                        cluster_pick = True;
180
                    
181
                    elif ((distance > previous_distance) and (cluster_pick is True)):
182
                        cluster_pick = False;
183
                        amount_clusters += 1;
184
                
185
                previous_distance = distance;
186
            
187
            else:
188
                cluster_start = False;
189
                cluster_pick = False;
190
        
191
        return amount_clusters;
192
193
194
class optics_descriptor:
195
    """!
196
    @brief Object description that used by OPTICS algorithm for cluster analysis.
197
    
198
    """
199
200
    def __init__(self, index, core_distance = None, reachability_distance = None):
201
        """!
202
        @brief Constructor of object description in optics terms.
203
        
204
        @param[in] index (uint): Index of the object in the data set.
205
        @param[in] core_distance (double): Core distance that is minimum distance to specified number of neighbors.
206
        @param[in] reachability_distance (double): Reachability distance to this object.
207
        
208
        """
209
        
210
        ## Reachability distance - the smallest distance to be reachable by core object.
211
        self.index_object = index;
212
        
213
        ## Core distance - the smallest distance to reach specified number of neighbors that is not greater then connectivity radius.
214
        self.core_distance = core_distance;
215
        
216
        ## Index of object from the input data.
217
        self.reachability_distance = reachability_distance;
218
        
219
        ## True is object has been already traversed.
220
        self.processed = False;
221
222
    def __repr__(self):
223
        """!
224
        @brief Returns string representation of the optics descriptor.
225
        
226
        """
227
        
228
        return '(%s, [c: %s, r: %s])' % (self.index_object, self.core_distance, self.reachability_distance);
229
230
231
class optics:
232
    """!
233
    @brief Class represents clustering algorithm OPTICS (Ordering Points To Identify Clustering Structure).
234
    @details OPTICS is a density-based algorithm. Purpose of the algorithm is to provide explicit clusters, but create clustering-ordering representation of the input data. 
235
             Clustering-ordering information contains information about internal structures of data set in terms of density and proper connectivity radius can be obtained
236
             for allocation required amount of clusters using this diagram. In case of usage additional input parameter 'amount of clusters' connectivity radius should be
237
             bigger than real - because it will be calculated by the algorithms if requested amount of clusters is not allocated.
238
239
    @image html optics_cluster_allocation.png "Scheme how does OPTICS works. At the beginning only one cluster is allocated, but two is requested. At the second step OPTICS calculates connectivity radius using cluster-ordering and performs final cluster allocation."
240
241
    Example:
242
    @code
243
        # Read sample for clustering from some file
244
        sample = read_sample(path_sample);
245
        
246
        # Create OPTICS algorithm for cluster analysis
247
        optics_instance = optics(sample, 0.5, 6);
248
        
249
        # Run cluster analysis
250
        optics_instance.process();
251
        
252
        # Obtain results of clustering
253
        clusters = optics_instance.get_clusters();
254
        noise = optics_instance.get_noise();
255
        
256
        # Obtain rechability-distances
257
        ordering = ordering_analyser(optics_instance.get_ordering());
258
        
259
        # Visualization of cluster ordering in line with reachability distance.
260
        ordering_visualizer.show_ordering_diagram(ordering);
261
    @endcode
262
    
263
    Amount of clusters that should be allocated can be also specified. In this case connectivity radius should be greater than real, for example:
264
    @code
265
        # Import required packages
266
        from pyclustering.cluster.optics import optics;
267
        from pyclustering.samples.definitions import FCPS_SAMPLES;
268
        from pyclustering.utils import read_sample;
269
        
270
        # Read sample for clustering from some file
271
        sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN);
272
        
273
        # Run cluster analysis where connvectivity radius is bigger than real
274
        radius = 2.0;
275
        neighbors = 3;
276
        amount_of_clusters = 3;
277
        
278
        optics_instance = optics(sample, radius, neighbors, amount_of_clusters);
279
        
280
        # Obtain results of clustering
281
        clusters = optics_instance.get_clusters();
282
        noise = optics_instance.get_noise();
283
    @endcode
284
    
285
    """
286
    
287
    def __init__(self, sample, eps, minpts, amount_clusters = None, ccore = False):
288
        """!
289
        @brief Constructor of clustering algorithm OPTICS.
290
        
291
        @param[in] sample (list): Input data that is presented as a list of points (objects), where each point is represented by list or tuple.
292
        @param[in] eps (double): Connectivity radius between points, points may be connected if distance between them less than the radius.
293
        @param[in] minpts (uint): Minimum number of shared neighbors that is required for establishing links between points.
294
        @param[in] amount_clusters (uint): Optional parameter where amount of clusters that should be allocated is specified.
295
                    In case of usage 'amount_clusters' connectivity radius can be greater than real, in other words, there is place for mistake
296
                    in connectivity radius usage.
297
        @param[in] ccore (bool): if True than DLL CCORE (C++ solution) will be used for solving the problem.
298
        
299
        """
300
        
301
        self.__sample_pointer = sample;     # Algorithm parameter - pointer to sample for processing.
302
        self.__eps = eps;                   # Algorithm parameter - connectivity radius between object for establish links between object.
303
        self.__minpts = minpts;             # Algorithm parameter - minimum number of neighbors that is required for establish links between object.
304
        self.__amount_clusters = amount_clusters;
305
        
306
        self.__ordering = None;
307
        self.__clusters = None;
308
        self.__noise = None;
309
        
310
        self.__ccore = ccore;
311
312
313
    def process(self):
314
        """!
315
        @brief Performs cluster analysis in line with rules of OPTICS algorithm.
316
        
317
        @remark Results of clustering can be obtained using corresponding gets methods.
318
        
319
        @see get_clusters()
320
        @see get_noise()
321
        @see get_ordering()
322
        
323
        """
324
        
325
        if (self.__ccore is True):
326
            (self.__clusters, self.__noise, self.__ordering, self.__eps) = wrapper.optics(self.__sample_pointer, self.__eps, self.__minpts, self.__amount_clusters);
327
        
328
        else:
329
            self.__allocate_clusters();
330
            
331
            if ( (self.__amount_clusters is not None) and (self.__amount_clusters != len(self.get_clusters())) ):
332
                analyser = ordering_analyser(self.get_ordering());
333
                radius = analyser.calculate_connvectivity_radius(self.__amount_clusters);
334
                if (radius is not None):
335
                    self.__eps = radius;
336
                    self.__allocate_clusters();
337
338
339
    def __initialize(self, sample):
340
        """!
341
        @brief Initializes internal states and resets clustering results in line with input sample.
342
        
343
        """
344
        
345
        self.__processed = [False] * len(sample);
346
        self.__optics_objects = [optics_descriptor(i) for i in range(len(sample))];     # List of OPTICS objects that corresponds to objects from input sample.
347
        self.__ordered_database = [];       # List of OPTICS objects in traverse order. 
348
        
349
        self.__clusters = None;     # Result of clustering (list of clusters where each cluster contains indexes of objects from input data).
350
        self.__noise = None;        # Result of clustering (noise).
351
352
353
    def __allocate_clusters(self):
354
        """!
355
        @brief Performs cluster allocation and builds ordering diagram that is based on reachability-distances.
356
        
357
        """
358
        
359
        self.__initialize(self.__sample_pointer);
360
        
361
        for optic_object in self.__optics_objects:
362
            if (optic_object.processed is False):
363
                self.__expand_cluster_order(optic_object);
364
        
365
        self.__extract_clusters();
366
    
367
    
368
    def get_clusters(self):
369
        """!
370
        @brief Returns list of allocated clusters, where each cluster contains indexes of objects and each cluster is represented by list.
371
        
372
        @return (list) List of allocated clusters.
373
        
374
        @see process()
375
        @see get_noise()
376
        @see get_ordering()
377
        @see get_radius()
378
        
379
        """
380
        
381
        return self.__clusters;
382
    
383
    
384
    def get_noise(self):
385
        """!
386
        @brief Returns list of noise that contains indexes of objects that corresponds to input data.
387
        
388
        @return (list) List of allocated noise objects.
389
        
390
        @see process()
391
        @see get_clusters()
392
        @see get_ordering()
393
        @see get_radius()
394
        
395
        """
396
        
397
        return self.__noise;
398
    
399
    
400
    def get_ordering(self):
401
        """!
402
        @brief Returns clustering ordering information about the input data set.
403
        @details Clustering ordering of data-set contains the information about the internal clustering structure in line with connectivity radius.
404
        
405
        @return (ordering_analyser) Analyser of clustering ordering.
406
        
407
        @see process()
408
        @see get_clusters()
409
        @see get_noise()
410
        @see get_radius()
411
        
412
        """
413
        
414
        if (self.__ordering is None):
415
            self.__ordering = [];
416
        
417
            for cluster in self.__clusters:
418
                for index_object in cluster:
419
                    optics_object = self.__optics_objects[index_object];
420
                    if (optics_object.reachability_distance is not None):
421
                        self.__ordering.append(optics_object.reachability_distance);
422
            
423
        return self.__ordering;
424
    
425
    
426
    def get_radius(self):
427
        """!
428
        @brief Returns connectivity radius that is calculated and used for clustering by the algorithm.
429
        @details Connectivity radius may be changed only in case of usage additional parameter of the algorithm - amount of clusters for allocation.
430
        
431
        @return (double) Connectivity radius.
432
        
433
        @see get_ordering()
434
        @see get_clusters()
435
        @see get_noise()
436
        
437
        """
438
        
439
        return self.__eps;
440
    
441
442
    def get_cluster_encoding(self):
443
        """!
444
        @brief Returns clustering result representation type that indicate how clusters are encoded.
445
        
446
        @return (type_encoding) Clustering result representation.
447
        
448
        @see get_clusters()
449
        
450
        """
451
        
452
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
453
454
455
    def __expand_cluster_order(self, optics_object):
456
        """!
457
        @brief Expand cluster order from not processed optic-object that corresponds to object from input data.
458
               Traverse procedure is performed until objects are reachable from core-objects in line with connectivity radius.
459
               Order database is updated during expanding.
460
               
461
        @param[in] optics_object (optics_descriptor): Object that hasn't been processed.
462
        
463
        """
464
        
465
        optics_object.processed = True;
466
        
467
        neighbors_descriptor = self.__neighbor_indexes(optics_object);
468
        optics_object.reachability_distance = None;
469
        
470
        self.__ordered_database.append(optics_object);
471
        
472
        # Check core distance
473
        if (len(neighbors_descriptor) >= self.__minpts):
474
            neighbors_descriptor.sort(key = lambda obj: obj[1]);
475
            optics_object.core_distance = neighbors_descriptor[self.__minpts - 1][1];
476
            
477
            # Continue processing
478
            order_seed = list();
479
            self.__update_order_seed(optics_object, neighbors_descriptor, order_seed);
480
            
481
            while(len(order_seed) > 0):
482
                optic_descriptor = order_seed[0];
483
                order_seed.remove(optic_descriptor);
484
                
485
                neighbors_descriptor = self.__neighbor_indexes(optic_descriptor);
486
                optic_descriptor.processed = True;
487
                
488
                self.__ordered_database.append(optic_descriptor);
489
                
490
                if (len(neighbors_descriptor) >= self.__minpts):
491
                    neighbors_descriptor.sort(key = lambda obj: obj[1]);
492
                    optic_descriptor.core_distance = neighbors_descriptor[self.__minpts - 1][1];
493
                    
494
                    self.__update_order_seed(optic_descriptor, neighbors_descriptor, order_seed);
495
                else:
496
                    optic_descriptor.core_distance = None;
497
                    
498
        else:
499
            optics_object.core_distance = None;
500
501
    
502
    def __extract_clusters(self):
503
        """!
504
        @brief Extract clusters and noise from order database.
505
        
506
        """
507
     
508
        self.__clusters = [];
509
        self.__noise = [];
510
511
        current_cluster = [];
512
        for optics_object in self.__ordered_database:
513
            if ((optics_object.reachability_distance is None) or (optics_object.reachability_distance > self.__eps)):
514
                if ((optics_object.core_distance is not None) and (optics_object.core_distance <= self.__eps)):
515
                    if (len(current_cluster) > 0):
516
                        self.__clusters.append(current_cluster);
517
                        current_cluster = [];
518
                        
519
                    current_cluster.append(optics_object.index_object);
520
                else:
521
                    self.__noise.append(optics_object.index_object);
522
            else:
523
                current_cluster.append(optics_object.index_object);
524
        
525
        if (len(current_cluster) > 0):
526
            self.__clusters.append(current_cluster);
527
                
528
529
    def __update_order_seed(self, optic_descriptor, neighbors_descriptors, order_seed):
530
        """!
531
        @brief Update sorted list of reachable objects (from core-object) that should be processed using neighbors of core-object.
532
        
533
        @param[in] optic_descriptor (optics_descriptor): Core-object whose neighbors should be analysed.
534
        @param[in] neighbors_descriptors (list): List of neighbors of core-object.
535
        @param[in|out] order_seed (list): List of sorted object in line with reachable distance.
536
        
537
        """
538
        
539
        for neighbor_descriptor in neighbors_descriptors:
540
            index_neighbor = neighbor_descriptor[0];
541
            current_reachable_distance = neighbor_descriptor[1];
542
            
543
            if (self.__optics_objects[index_neighbor].processed != True):
544
                reachable_distance = max(current_reachable_distance, optic_descriptor.core_distance);
545
                if (self.__optics_objects[index_neighbor].reachability_distance is None):
546
                    self.__optics_objects[index_neighbor].reachability_distance = reachable_distance;
547
                    
548
                    # insert element in queue O(n) - worst case.
549
                    index_insertion = len(order_seed);
550
                    for index_seed in range(0, len(order_seed)):
551
                        if (reachable_distance < order_seed[index_seed].reachability_distance):
552
                            index_insertion = index_seed;
553
                            break;
554
                    
555
                    order_seed.insert(index_insertion, self.__optics_objects[index_neighbor]);
556
557
                else:
558
                    if (reachable_distance < self.__optics_objects[index_neighbor].reachability_distance):
559
                        self.__optics_objects[index_neighbor].reachability_distance = reachable_distance;
560
                        order_seed.sort(key = lambda obj: obj.reachability_distance);
561
562
563
    def __neighbor_indexes(self, optic_object):
564
        """!
565
        @brief Return list of indexes of neighbors of specified point for the data.
566
        
567
        @param[in] optic_object (optics_descriptor): Object for which neighbors should be returned in line with connectivity radius.
568
        
569
        @return (list) List of indexes of neighbors in line the connectivity radius.
570
        
571
        """
572
              
573
        neighbor_description = [];
574
        
575
        for index in range(0, len(self.__sample_pointer), 1):
576
            if (index == optic_object.index_object):
577
                continue;
578
            
579
            distance = euclidean_distance(self.__sample_pointer[optic_object.index_object], self.__sample_pointer[index]);
580
            if (distance <= self.__eps):
581
                neighbor_description.append( [index, distance] );
582
            
583
        return neighbor_description;