Completed
Push — master ( 32ea1c...e48e70 )
by Andrei
01:23
created

ordering_analyser.cluster_ordering()   A

Complexity

Conditions 1

Size

Total Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
dl 0
loc 7
rs 9.4285
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: OPTICS (Ordering Points To Identify Clustering Structure)
4
@details Based on article description:
5
         - M.Ankerst, M.Breunig, H.Kriegel, J.Sander. OPTICS: Ordering Points To Identify the Clustering Structure. 1999.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2017
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
from pyclustering.cluster.encoder import type_encoding;
30
31
from pyclustering.utils import euclidean_distance;
32
33
import matplotlib.pyplot as plt;
0 ignored issues
show
Configuration introduced by
The import matplotlib.pyplot could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
34
35
import pyclustering.core.optics_wrapper as wrapper;
36
37
38
class ordering_visualizer:
39
    """!
40
    @brief Cluster ordering diagram visualizer that represents dataset graphically as density-based clustering structure.
41
    
42
    @see ordering_analyser
43
    
44
    """
45
    
46
    @staticmethod
47
    def show_ordering_diagram(analyser, amount_clusters = None):
48
        """!
49
        @brief Display cluster-ordering diagram.
50
        
51
        @param[in] analyser (ordering_analyser): cluster-ordering analyser whose ordering diagram should be displayed.
52
        @param[in] amount_clusters (uint): if it is not 'None' then it displays connectivity radius line that can used for allocation of specified amount of clusters.
53
        
54
        Example demonstrates general abilities of 'ordering_visualizer' class:
55
        @code
56
        # Display cluster-ordering diagram with connectivity radius is used for allocation of three clusters.
57
        ordering_visualizer.show_ordering_diagram(analyser, 3);
58
        
59
        # Display cluster-ordering diagram without radius.
60
        ordering_visualizer.show_ordering_diagram(analyser);
61
        @endcode
62
        
63
        """
64
        ordering = analyser.cluster_ordering;
65
        indexes = [i for i in range(0, len(ordering))];
66
        
67
        axis = plt.subplot(111);
68
        axis.bar(indexes, ordering, color = 'black');
69
        plt.xlim([0, len(ordering)]);
70
        
71
        if (amount_clusters is not None):
72
            radius = analyser.calculate_connvectivity_radius(amount_clusters);
73
            plt.axhline(y = analyser.calculate_connvectivity_radius(amount_clusters), linewidth = 2, color = 'b');
74
            plt.text(0, radius + radius * 0.03, " Radius:   " + str(round(radius, 4)) + ";\n Clusters: " + str(amount_clusters), color = 'b', fontsize = 10);
75
        
76
        plt.show();
77
78
79
class ordering_analyser:
80
    """!
81
    @brief Analyser of cluster ordering diagram.
82
    @details Using cluster-ordering it is able to connectivity radius for allocation of specified amount of clusters and
83
              calculate amount of clusters using specified connectivity radius. Cluster-ordering is formed by OPTICS algorithm
84
              during cluster analysis.
85
    
86
    @see optics
87
    
88
    """
89
    
90
    @property
91
    def cluster_ordering(self):
92
        """!
93
        @brief (list) Returns values of dataset cluster ordering.
94
        
95
        """
96
        return self.__ordering;
97
    
98
    
99
    def __init__(self, ordering_diagram):
100
        """!
101
        @brief Analyser of ordering diagram that is based on reachability-distances.
102
        
103
        @see calculate_connvectivity_radius
104
        
105
        """
106
        self.__ordering = ordering_diagram;
107
    
108
    
109
    def __len__(self):
110
        """!
111
        @brief Returns length of clustering-ordering diagram.
112
        
113
        """
114
        return len(self.__ordering);
115
    
116
    
117
    def calculate_connvectivity_radius(self, amount_clusters, maximum_iterations = 100):
118
        """!
119
        @brief Calculates connectivity radius of allocation specified amount of clusters using ordering diagram.
120
        
121
        @param[in] amount_clusters (uint): amount of clusters that should be allocated by calculated connectivity radius.
122
        @param[in] maximum_iterations (uint): maximum number of iteration for searching connectivity radius to allocated specified amount of clusters (by default it is restricted by 100 iterations).
123
        
124
        @return (double) Value of connectivity radius, it may be 'None' if connectivity radius hasn't been found for the specified amount of iterations.
125
        
126
        """
127
        
128
        maximum_distance = max(self.__ordering);
129
        
130
        upper_distance = maximum_distance;
131
        lower_distance = 0.0;
132
        
133
        radius = None;
134
        
135
        if (self.extract_cluster_amount(maximum_distance) <= amount_clusters):
136
            for _ in range(maximum_iterations):
137
                radius = (lower_distance + upper_distance) / 2.0;
138
                
139
                amount = self.extract_cluster_amount(radius);
140
                if ( (amount == amount_clusters) or (amount_clusters == 0)):
141
                    break;
142
                
143
                elif (amount > amount_clusters):
144
                    lower_distance = radius;
145
                
146
                elif (amount < amount_clusters):
147
                    upper_distance = radius;
148
        
149
        return radius;
150
    
151
    
152
    def extract_cluster_amount(self, radius):
153
        """!
154
        @brief Obtains amount of clustering that can be allocated by using specified radius for ordering diagram.
155
        @details When growth of reachability-distances is detected than it is considered as a start point of cluster, 
156
                 than pick is detected and after that recession is observed until new growth (that means end of the
157
                 current cluster and start of a new one) or end of diagram.
158
        
159
        @param[in] radius (double): connectivity radius that is used for cluster allocation.
160
        
161
        
162
        @return (unit) Amount of clusters that can be allocated by the connectivity radius on ordering diagram.
163
        
164
        """
165
        
166
        amount_clusters = 1;
167
        
168
        cluster_start = False;
169
        cluster_pick = False;
170
        total_similarity = True;
171
        previous_cluster_distance = None;
172
        previous_distance = None;
173
        
174
        for distance in self.__ordering:
175
            if (distance >= radius):
176
                if (cluster_start is False):
177
                    cluster_start = True;
178
                    amount_clusters += 1;
179
                
180
                else:
181
                    if ((distance < previous_cluster_distance) and (cluster_pick is False)):
182
                        cluster_pick = True;
183
                    
184
                    elif ((distance > previous_cluster_distance) and (cluster_pick is True)):
185
                        cluster_pick = False;
186
                        amount_clusters += 1;
187
                
188
                previous_cluster_distance = distance;
189
            
190
            else:
191
                cluster_start = False;
192
                cluster_pick = False;
193
            
194
            if ( (previous_distance is not None) and (distance != previous_distance) ):
195
                total_similarity = False;
196
            
197
            previous_distance = distance;
198
        
199
        if ( (total_similarity is True) and (previous_distance > radius) ):
200
            amount_clusters = 0;
201
        
202
        return amount_clusters;
203
204
205
class optics_descriptor:
206
    """!
207
    @brief Object description that used by OPTICS algorithm for cluster analysis.
208
    
209
    """
210
211
    def __init__(self, index, core_distance = None, reachability_distance = None):
212
        """!
213
        @brief Constructor of object description in optics terms.
214
        
215
        @param[in] index (uint): Index of the object in the data set.
216
        @param[in] core_distance (double): Core distance that is minimum distance to specified number of neighbors.
217
        @param[in] reachability_distance (double): Reachability distance to this object.
218
        
219
        """
220
        
221
        ## Reachability distance - the smallest distance to be reachable by core object.
222
        self.index_object = index;
223
        
224
        ## Core distance - the smallest distance to reach specified number of neighbors that is not greater then connectivity radius.
225
        self.core_distance = core_distance;
226
        
227
        ## Index of object from the input data.
228
        self.reachability_distance = reachability_distance;
229
        
230
        ## True is object has been already traversed.
231
        self.processed = False;
232
233
    def __repr__(self):
234
        """!
235
        @brief Returns string representation of the optics descriptor.
236
        
237
        """
238
        
239
        return '(%s, [c: %s, r: %s])' % (self.index_object, self.core_distance, self.reachability_distance);
240
241
242
class optics:
243
    """!
244
    @brief Class represents clustering algorithm OPTICS (Ordering Points To Identify Clustering Structure).
245
    @details OPTICS is a density-based algorithm. Purpose of the algorithm is to provide explicit clusters, but create clustering-ordering representation of the input data. 
246
             Clustering-ordering information contains information about internal structures of data set in terms of density and proper connectivity radius can be obtained
247
             for allocation required amount of clusters using this diagram. In case of usage additional input parameter 'amount of clusters' connectivity radius should be
248
             bigger than real - because it will be calculated by the algorithms if requested amount of clusters is not allocated.
249
250
    @image html optics_cluster_allocation.png "Scheme how does OPTICS works. At the beginning only one cluster is allocated, but two is requested. At the second step OPTICS calculates connectivity radius using cluster-ordering and performs final cluster allocation."
251
252
    Example:
253
    @code
254
        # Read sample for clustering from some file
255
        sample = read_sample(path_sample);
256
        
257
        # Create OPTICS algorithm for cluster analysis
258
        optics_instance = optics(sample, 0.5, 6);
259
        
260
        # Run cluster analysis
261
        optics_instance.process();
262
        
263
        # Obtain results of clustering
264
        clusters = optics_instance.get_clusters();
265
        noise = optics_instance.get_noise();
266
        
267
        # Obtain rechability-distances
268
        ordering = ordering_analyser(optics_instance.get_ordering());
269
        
270
        # Visualization of cluster ordering in line with reachability distance.
271
        ordering_visualizer.show_ordering_diagram(ordering);
272
    @endcode
273
    
274
    Amount of clusters that should be allocated can be also specified. In this case connectivity radius should be greater than real, for example:
275
    @code
276
        # Import required packages
277
        from pyclustering.cluster.optics import optics;
278
        from pyclustering.samples.definitions import FCPS_SAMPLES;
279
        from pyclustering.utils import read_sample;
280
        
281
        # Read sample for clustering from some file
282
        sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN);
283
        
284
        # Run cluster analysis where connvectivity radius is bigger than real
285
        radius = 2.0;
286
        neighbors = 3;
287
        amount_of_clusters = 3;
288
        
289
        optics_instance = optics(sample, radius, neighbors, amount_of_clusters);
290
        
291
        # Obtain results of clustering
292
        clusters = optics_instance.get_clusters();
293
        noise = optics_instance.get_noise();
294
    @endcode
295
    
296
    """
297
    
298
    def __init__(self, sample, eps, minpts, amount_clusters = None, ccore = False):
299
        """!
300
        @brief Constructor of clustering algorithm OPTICS.
301
        
302
        @param[in] sample (list): Input data that is presented as a list of points (objects), where each point is represented by list or tuple.
303
        @param[in] eps (double): Connectivity radius between points, points may be connected if distance between them less than the radius.
304
        @param[in] minpts (uint): Minimum number of shared neighbors that is required for establishing links between points.
305
        @param[in] amount_clusters (uint): Optional parameter where amount of clusters that should be allocated is specified.
306
                    In case of usage 'amount_clusters' connectivity radius can be greater than real, in other words, there is place for mistake
307
                    in connectivity radius usage.
308
        @param[in] ccore (bool): if True than DLL CCORE (C++ solution) will be used for solving the problem.
309
        
310
        """
311
        
312
        self.__sample_pointer = sample;     # Algorithm parameter - pointer to sample for processing.
313
        self.__eps = eps;                   # Algorithm parameter - connectivity radius between object for establish links between object.
314
        self.__minpts = minpts;             # Algorithm parameter - minimum number of neighbors that is required for establish links between object.
315
        self.__amount_clusters = amount_clusters;
316
        
317
        self.__ordering = None;
318
        self.__clusters = None;
319
        self.__noise = None;
320
        
321
        self.__ccore = ccore;
322
323
324
    def process(self):
325
        """!
326
        @brief Performs cluster analysis in line with rules of OPTICS algorithm.
327
        
328
        @remark Results of clustering can be obtained using corresponding gets methods.
329
        
330
        @see get_clusters()
331
        @see get_noise()
332
        @see get_ordering()
333
        
334
        """
335
        
336
        if (self.__ccore is True):
337
            (self.__clusters, self.__noise, self.__ordering, self.__eps) = wrapper.optics(self.__sample_pointer, self.__eps, self.__minpts, self.__amount_clusters);
338
        
339
        else:
340
            self.__allocate_clusters();
341
            
342
            if ( (self.__amount_clusters is not None) and (self.__amount_clusters != len(self.get_clusters())) ):
343
                analyser = ordering_analyser(self.get_ordering());
344
                radius = analyser.calculate_connvectivity_radius(self.__amount_clusters);
345
                if (radius is not None):
346
                    self.__eps = radius;
347
                    self.__allocate_clusters();
348
349
350
    def __initialize(self, sample):
351
        """!
352
        @brief Initializes internal states and resets clustering results in line with input sample.
353
        
354
        """
355
        
356
        self.__processed = [False] * len(sample);
357
        self.__optics_objects = [optics_descriptor(i) for i in range(len(sample))];     # List of OPTICS objects that corresponds to objects from input sample.
358
        self.__ordered_database = [];       # List of OPTICS objects in traverse order. 
359
        
360
        self.__clusters = None;     # Result of clustering (list of clusters where each cluster contains indexes of objects from input data).
361
        self.__noise = None;        # Result of clustering (noise).
362
363
364
    def __allocate_clusters(self):
365
        """!
366
        @brief Performs cluster allocation and builds ordering diagram that is based on reachability-distances.
367
        
368
        """
369
        
370
        self.__initialize(self.__sample_pointer);
371
        
372
        for optic_object in self.__optics_objects:
373
            if (optic_object.processed is False):
374
                self.__expand_cluster_order(optic_object);
375
        
376
        self.__extract_clusters();
377
    
378
    
379
    def get_clusters(self):
380
        """!
381
        @brief Returns list of allocated clusters, where each cluster contains indexes of objects and each cluster is represented by list.
382
        
383
        @return (list) List of allocated clusters.
384
        
385
        @see process()
386
        @see get_noise()
387
        @see get_ordering()
388
        @see get_radius()
389
        
390
        """
391
        
392
        return self.__clusters;
393
    
394
    
395
    def get_noise(self):
396
        """!
397
        @brief Returns list of noise that contains indexes of objects that corresponds to input data.
398
        
399
        @return (list) List of allocated noise objects.
400
        
401
        @see process()
402
        @see get_clusters()
403
        @see get_ordering()
404
        @see get_radius()
405
        
406
        """
407
        
408
        return self.__noise;
409
    
410
    
411
    def get_ordering(self):
412
        """!
413
        @brief Returns clustering ordering information about the input data set.
414
        @details Clustering ordering of data-set contains the information about the internal clustering structure in line with connectivity radius.
415
        
416
        @return (ordering_analyser) Analyser of clustering ordering.
417
        
418
        @see process()
419
        @see get_clusters()
420
        @see get_noise()
421
        @see get_radius()
422
        
423
        """
424
        
425
        if (self.__ordering is None):
426
            self.__ordering = [];
427
        
428
            for cluster in self.__clusters:
429
                for index_object in cluster:
430
                    optics_object = self.__optics_objects[index_object];
431
                    if (optics_object.reachability_distance is not None):
432
                        self.__ordering.append(optics_object.reachability_distance);
433
            
434
        return self.__ordering;
435
    
436
    
437
    def get_radius(self):
438
        """!
439
        @brief Returns connectivity radius that is calculated and used for clustering by the algorithm.
440
        @details Connectivity radius may be changed only in case of usage additional parameter of the algorithm - amount of clusters for allocation.
441
        
442
        @return (double) Connectivity radius.
443
        
444
        @see get_ordering()
445
        @see get_clusters()
446
        @see get_noise()
447
        
448
        """
449
        
450
        return self.__eps;
451
    
452
453
    def get_cluster_encoding(self):
454
        """!
455
        @brief Returns clustering result representation type that indicate how clusters are encoded.
456
        
457
        @return (type_encoding) Clustering result representation.
458
        
459
        @see get_clusters()
460
        
461
        """
462
        
463
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
464
465
466
    def __expand_cluster_order(self, optics_object):
467
        """!
468
        @brief Expand cluster order from not processed optic-object that corresponds to object from input data.
469
               Traverse procedure is performed until objects are reachable from core-objects in line with connectivity radius.
470
               Order database is updated during expanding.
471
               
472
        @param[in] optics_object (optics_descriptor): Object that hasn't been processed.
473
        
474
        """
475
        
476
        optics_object.processed = True;
477
        
478
        neighbors_descriptor = self.__neighbor_indexes(optics_object);
479
        optics_object.reachability_distance = None;
480
        
481
        self.__ordered_database.append(optics_object);
482
        
483
        # Check core distance
484
        if (len(neighbors_descriptor) >= self.__minpts):
485
            neighbors_descriptor.sort(key = lambda obj: obj[1]);
486
            optics_object.core_distance = neighbors_descriptor[self.__minpts - 1][1];
487
            
488
            # Continue processing
489
            order_seed = list();
490
            self.__update_order_seed(optics_object, neighbors_descriptor, order_seed);
491
            
492
            while(len(order_seed) > 0):
493
                optic_descriptor = order_seed[0];
494
                order_seed.remove(optic_descriptor);
495
                
496
                neighbors_descriptor = self.__neighbor_indexes(optic_descriptor);
497
                optic_descriptor.processed = True;
498
                
499
                self.__ordered_database.append(optic_descriptor);
500
                
501
                if (len(neighbors_descriptor) >= self.__minpts):
502
                    neighbors_descriptor.sort(key = lambda obj: obj[1]);
503
                    optic_descriptor.core_distance = neighbors_descriptor[self.__minpts - 1][1];
504
                    
505
                    self.__update_order_seed(optic_descriptor, neighbors_descriptor, order_seed);
506
                else:
507
                    optic_descriptor.core_distance = None;
508
                    
509
        else:
510
            optics_object.core_distance = None;
511
512
    
513
    def __extract_clusters(self):
514
        """!
515
        @brief Extract clusters and noise from order database.
516
        
517
        """
518
     
519
        self.__clusters = [];
520
        self.__noise = [];
521
522
        current_cluster = [];
523
        for optics_object in self.__ordered_database:
524
            if ((optics_object.reachability_distance is None) or (optics_object.reachability_distance > self.__eps)):
525
                if ((optics_object.core_distance is not None) and (optics_object.core_distance <= self.__eps)):
526
                    if (len(current_cluster) > 0):
527
                        self.__clusters.append(current_cluster);
528
                        current_cluster = [];
529
                        
530
                    current_cluster.append(optics_object.index_object);
531
                else:
532
                    self.__noise.append(optics_object.index_object);
533
            else:
534
                current_cluster.append(optics_object.index_object);
535
        
536
        if (len(current_cluster) > 0):
537
            self.__clusters.append(current_cluster);
538
                
539
540
    def __update_order_seed(self, optic_descriptor, neighbors_descriptors, order_seed):
541
        """!
542
        @brief Update sorted list of reachable objects (from core-object) that should be processed using neighbors of core-object.
543
        
544
        @param[in] optic_descriptor (optics_descriptor): Core-object whose neighbors should be analysed.
545
        @param[in] neighbors_descriptors (list): List of neighbors of core-object.
546
        @param[in|out] order_seed (list): List of sorted object in line with reachable distance.
547
        
548
        """
549
        
550
        for neighbor_descriptor in neighbors_descriptors:
551
            index_neighbor = neighbor_descriptor[0];
552
            current_reachable_distance = neighbor_descriptor[1];
553
            
554
            if (self.__optics_objects[index_neighbor].processed != True):
555
                reachable_distance = max(current_reachable_distance, optic_descriptor.core_distance);
556
                if (self.__optics_objects[index_neighbor].reachability_distance is None):
557
                    self.__optics_objects[index_neighbor].reachability_distance = reachable_distance;
558
                    
559
                    # insert element in queue O(n) - worst case.
560
                    index_insertion = len(order_seed);
561
                    for index_seed in range(0, len(order_seed)):
562
                        if (reachable_distance < order_seed[index_seed].reachability_distance):
563
                            index_insertion = index_seed;
564
                            break;
565
                    
566
                    order_seed.insert(index_insertion, self.__optics_objects[index_neighbor]);
567
568
                else:
569
                    if (reachable_distance < self.__optics_objects[index_neighbor].reachability_distance):
570
                        self.__optics_objects[index_neighbor].reachability_distance = reachable_distance;
571
                        order_seed.sort(key = lambda obj: obj.reachability_distance);
572
573
574
    def __neighbor_indexes(self, optic_object):
575
        """!
576
        @brief Return list of indexes of neighbors of specified point for the data.
577
        
578
        @param[in] optic_object (optics_descriptor): Object for which neighbors should be returned in line with connectivity radius.
579
        
580
        @return (list) List of indexes of neighbors in line the connectivity radius.
581
        
582
        """
583
              
584
        neighbor_description = [];
585
        
586
        for index in range(0, len(self.__sample_pointer), 1):
587
            if (index == optic_object.index_object):
588
                continue;
589
            
590
            distance = euclidean_distance(self.__sample_pointer[optic_object.index_object], self.__sample_pointer[index]);
591
            if (distance <= self.__eps):
592
                neighbor_description.append( [index, distance] );
593
            
594
        return neighbor_description;