Completed
Push — 0.7.dev ( e8eda8...b87d98 )
by Andrei
58s
created

optics.__extract_clusters()   B

Complexity

Conditions 6

Size

Total Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
dl 0
loc 19
rs 8
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: OPTICS (Ordering Points To Identify Clustering Structure)
4
@details Based on article description:
5
         - M.Ankerst, M.Breunig, H.Kriegel, J.Sander. OPTICS: Ordering Points To Identify the Clustering Structure. 1999.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2017
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
from pyclustering.cluster.encoder import type_encoding;
30
31
from pyclustering.utils import euclidean_distance;
32
from pyclustering.utils.color import color as color_list;
33
34
import matplotlib.pyplot as plt;
0 ignored issues
show
Configuration introduced by
The import matplotlib.pyplot could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
35
36
import pyclustering.core.optics_wrapper as wrapper;
37
38
39
class ordering_visualizer:
40
    """!
41
    @brief Cluster ordering diagram visualizer that represents dataset graphically as density-based clustering structure.
42
    
43
    @see ordering_analyser
44
    
45
    """
46
    
47
    @staticmethod
48
    def show_ordering_diagram(analyser, amount_clusters = None):
49
        """!
50
        @brief Display cluster-ordering (reachability-plot) diagram.
51
        
52
        @param[in] analyser (ordering_analyser): cluster-ordering analyser whose ordering diagram should be displayed.
53
        @param[in] amount_clusters (uint): if it is not 'None' then it displays connectivity radius line that can used for allocation of specified amount of clusters
54
                    and colorize diagram by corresponding cluster colors.
55
        
56
        Example demonstrates general abilities of 'ordering_visualizer' class:
57
        @code
58
        # Display cluster-ordering diagram with connectivity radius is used for allocation of three clusters.
59
        ordering_visualizer.show_ordering_diagram(analyser, 3);
60
        
61
        # Display cluster-ordering diagram without radius.
62
        ordering_visualizer.show_ordering_diagram(analyser);
63
        @endcode
64
        
65
        """
66
        ordering = analyser.cluster_ordering;
67
        axis = plt.subplot(111);
68
        
69
        if (amount_clusters is not None):
70
            radius, borders = analyser.calculate_connvectivity_radius(amount_clusters);
71
        
72
            # divide into cluster groups to visualize by colors
73
            left_index_border = 0;
74
            current_index_border = 0;
75
            for index_border in range(len(borders)):
76
                right_index_border = borders[index_border];
77
                axis.bar(range(left_index_border, right_index_border), ordering[left_index_border:right_index_border], width = 1.0, color = color_list.TITLES[index_border]);
78
                left_index_border = right_index_border;
79
                current_index_border = index_border;
80
            
81
            axis.bar(range(left_index_border, len(ordering)), ordering[left_index_border:len(ordering)], width = 1.0, color = color_list.TITLES[current_index_border + 1]);
82
            
83
            plt.xlim([0, len(ordering)]);
84
            
85
            plt.axhline(y = radius, linewidth = 2, color = 'black');
86
            plt.text(0, radius + radius * 0.03, " Radius:   " + str(round(radius, 4)) + ";\n Clusters: " + str(amount_clusters), color = 'b', fontsize = 10);
87
            
88
        else:
89
            axis.bar(range(0, len(ordering)), ordering[0:len(ordering)], width = 1.0, color = 'black');
90
            plt.xlim([0, len(ordering)]);
91
        
92
        plt.show();
93
94
95
class ordering_analyser:
96
    """!
97
    @brief Analyser of cluster ordering diagram.
98
    @details Using cluster-ordering it is able to connectivity radius for allocation of specified amount of clusters and
99
              calculate amount of clusters using specified connectivity radius. Cluster-ordering is formed by OPTICS algorithm
100
              during cluster analysis.
101
    
102
    @see optics
103
    
104
    """
105
    
106
    @property
107
    def cluster_ordering(self):
108
        """!
109
        @brief (list) Returns values of dataset cluster ordering.
110
        
111
        """
112
        return self.__ordering;
113
    
114
    
115
    def __init__(self, ordering_diagram):
116
        """!
117
        @brief Analyser of ordering diagram that is based on reachability-distances.
118
        
119
        @see calculate_connvectivity_radius
120
        
121
        """
122
        self.__ordering = ordering_diagram;
123
    
124
    
125
    def __len__(self):
126
        """!
127
        @brief Returns length of clustering-ordering diagram.
128
        
129
        """
130
        return len(self.__ordering);
131
    
132
    
133
    def calculate_connvectivity_radius(self, amount_clusters, maximum_iterations = 100):
134
        """!
135
        @brief Calculates connectivity radius of allocation specified amount of clusters using ordering diagram and marks borders of clusters using indexes of values of ordering diagram.
136
        @details Parameter 'maximum_iterations' is used to protect from hanging when it is impossible to allocate specified number of clusters.
137
        
138
        @param[in] amount_clusters (uint): amount of clusters that should be allocated by calculated connectivity radius.
139
        @param[in] maximum_iterations (uint): maximum number of iteration for searching connectivity radius to allocated specified amount of clusters (by default it is restricted by 100 iterations).
140
        
141
        @return (double, list) Value of connectivity radius and borders of clusters like (radius, borders), radius may be 'None' as well as borders may be '[]'
142
                                if connectivity radius hasn't been found for the specified amount of iterations.
143
        
144
        """
145
        
146
        maximum_distance = max(self.__ordering);
147
        
148
        upper_distance = maximum_distance;
149
        lower_distance = 0.0;
150
        
151
        radius = None;
152
        result = None;
153
        
154
        amount, borders = self.extract_cluster_amount(maximum_distance);
155
        if (amount <= amount_clusters):
156
            for _ in range(maximum_iterations):
157
                radius = (lower_distance + upper_distance) / 2.0;
158
                
159
                amount, borders = self.extract_cluster_amount(radius);
160
                if (amount == amount_clusters):
161
                    result = radius;
162
                    break;
163
                
164
                elif (amount == 0):
165
                    break;
166
                
167
                elif (amount > amount_clusters):
168
                    lower_distance = radius;
169
                
170
                elif (amount < amount_clusters):
171
                    upper_distance = radius;
172
        
173
        return result, borders;
174
    
175
    
176
    def extract_cluster_amount(self, radius):
177
        """!
178
        @brief Obtains amount of clustering that can be allocated by using specified radius for ordering diagram and borders between them.
179
        @details When growth of reachability-distances is detected than it is considered as a start point of cluster, 
180
                 than pick is detected and after that recession is observed until new growth (that means end of the
181
                 current cluster and start of a new one) or end of diagram.
182
        
183
        @param[in] radius (double): connectivity radius that is used for cluster allocation.
184
        
185
        @return (unit, list) Amount of clusters that can be allocated by the connectivity radius on ordering diagram and borders between them using indexes
186
                 from ordering diagram (amount_clusters, border_clusters).
187
        
188
        """
189
        
190
        amount_clusters = 1;
191
        
192
        cluster_start = False;
193
        cluster_pick = False;
194
        total_similarity = True;
195
        previous_cluster_distance = None;
196
        previous_distance = None;
197
        
198
        cluster_borders = [];
199
        
200
        for index_ordering in range(len(self.__ordering)):
201
            distance = self.__ordering[index_ordering];
202
            if (distance >= radius):
203
                if (cluster_start is False):
204
                    cluster_start = True;
205
                    amount_clusters += 1;
206
                    
207
                    if (index_ordering != 0):
208
                        cluster_borders.append(index_ordering);
209
                
210
                else:
211
                    if ((distance < previous_cluster_distance) and (cluster_pick is False)):
212
                        cluster_pick = True;
213
                    
214
                    elif ((distance > previous_cluster_distance) and (cluster_pick is True)):
215
                        cluster_pick = False;
216
                        amount_clusters += 1;
217
                        
218
                        if (index_ordering != 0):
219
                            cluster_borders.append(index_ordering);
220
                
221
                previous_cluster_distance = distance;
222
            
223
            else:
224
                cluster_start = False;
225
                cluster_pick = False;
226
            
227
            if ( (previous_distance is not None) and (distance != previous_distance) ):
228
                total_similarity = False;
229
            
230
            previous_distance = distance;
231
        
232
        if ( (total_similarity is True) and (previous_distance > radius) ):
233
            amount_clusters = 0;
234
235
        return amount_clusters, cluster_borders;
236
237
238
class optics_descriptor:
239
    """!
240
    @brief Object description that used by OPTICS algorithm for cluster analysis.
241
    
242
    """
243
244
    def __init__(self, index, core_distance = None, reachability_distance = None):
245
        """!
246
        @brief Constructor of object description in optics terms.
247
        
248
        @param[in] index (uint): Index of the object in the data set.
249
        @param[in] core_distance (double): Core distance that is minimum distance to specified number of neighbors.
250
        @param[in] reachability_distance (double): Reachability distance to this object.
251
        
252
        """
253
        
254
        ## Reachability distance - the smallest distance to be reachable by core object.
255
        self.index_object = index;
256
        
257
        ## Core distance - the smallest distance to reach specified number of neighbors that is not greater then connectivity radius.
258
        self.core_distance = core_distance;
259
        
260
        ## Index of object from the input data.
261
        self.reachability_distance = reachability_distance;
262
        
263
        ## True is object has been already traversed.
264
        self.processed = False;
265
266
    def __repr__(self):
267
        """!
268
        @brief Returns string representation of the optics descriptor.
269
        
270
        """
271
        
272
        return '(%s, [c: %s, r: %s])' % (self.index_object, self.core_distance, self.reachability_distance);
273
274
275
class optics:
276
    """!
277
    @brief Class represents clustering algorithm OPTICS (Ordering Points To Identify Clustering Structure).
278
    @details OPTICS is a density-based algorithm. Purpose of the algorithm is to provide explicit clusters, but create clustering-ordering representation of the input data. 
279
             Clustering-ordering information contains information about internal structures of data set in terms of density and proper connectivity radius can be obtained
280
             for allocation required amount of clusters using this diagram. In case of usage additional input parameter 'amount of clusters' connectivity radius should be
281
             bigger than real - because it will be calculated by the algorithms if requested amount of clusters is not allocated.
282
283
    @image html optics_example_clustering.png "Scheme how does OPTICS works. At the beginning only one cluster is allocated, but two is requested. At the second step OPTICS calculates connectivity radius using cluster-ordering and performs final cluster allocation."
284
285
    Example:
286
    @code
287
        # Read sample for clustering from some file
288
        sample = read_sample(path_sample);
289
        
290
        # Create OPTICS algorithm for cluster analysis
291
        optics_instance = optics(sample, 0.5, 6);
292
        
293
        # Run cluster analysis
294
        optics_instance.process();
295
        
296
        # Obtain results of clustering
297
        clusters = optics_instance.get_clusters();
298
        noise = optics_instance.get_noise();
299
        
300
        # Obtain rechability-distances
301
        ordering = ordering_analyser(optics_instance.get_ordering());
302
        
303
        # Visualization of cluster ordering in line with reachability distance.
304
        ordering_visualizer.show_ordering_diagram(ordering);
305
    @endcode
306
    
307
    Amount of clusters that should be allocated can be also specified. In this case connectivity radius should be greater than real, for example:
308
    @code
309
        # Import required packages
310
        from pyclustering.cluster.optics import optics;
311
        from pyclustering.samples.definitions import FCPS_SAMPLES;
312
        from pyclustering.utils import read_sample;
313
        
314
        # Read sample for clustering from some file
315
        sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN);
316
        
317
        # Run cluster analysis where connvectivity radius is bigger than real
318
        radius = 2.0;
319
        neighbors = 3;
320
        amount_of_clusters = 3;
321
        
322
        optics_instance = optics(sample, radius, neighbors, amount_of_clusters);
323
        
324
        # Obtain results of clustering
325
        clusters = optics_instance.get_clusters();
326
        noise = optics_instance.get_noise();
327
    @endcode
328
    
329
    """
330
    
331
    def __init__(self, sample, eps, minpts, amount_clusters = None, ccore = False):
332
        """!
333
        @brief Constructor of clustering algorithm OPTICS.
334
        
335
        @param[in] sample (list): Input data that is presented as a list of points (objects), where each point is represented by list or tuple.
336
        @param[in] eps (double): Connectivity radius between points, points may be connected if distance between them less than the radius.
337
        @param[in] minpts (uint): Minimum number of shared neighbors that is required for establishing links between points.
338
        @param[in] amount_clusters (uint): Optional parameter where amount of clusters that should be allocated is specified.
339
                    In case of usage 'amount_clusters' connectivity radius can be greater than real, in other words, there is place for mistake
340
                    in connectivity radius usage.
341
        @param[in] ccore (bool): if True than DLL CCORE (C++ solution) will be used for solving the problem.
342
        
343
        """
344
        
345
        self.__sample_pointer = sample;     # Algorithm parameter - pointer to sample for processing.
346
        self.__eps = eps;                   # Algorithm parameter - connectivity radius between object for establish links between object.
347
        self.__minpts = minpts;             # Algorithm parameter - minimum number of neighbors that is required for establish links between object.
348
        self.__amount_clusters = amount_clusters;
349
        
350
        self.__ordering = None;
351
        self.__clusters = None;
352
        self.__noise = None;
353
        
354
        self.__ccore = ccore;
355
356
357
    def process(self):
358
        """!
359
        @brief Performs cluster analysis in line with rules of OPTICS algorithm.
360
        
361
        @remark Results of clustering can be obtained using corresponding gets methods.
362
        
363
        @see get_clusters()
364
        @see get_noise()
365
        @see get_ordering()
366
        
367
        """
368
        
369
        if (self.__ccore is True):
370
            (self.__clusters, self.__noise, self.__ordering, self.__eps) = wrapper.optics(self.__sample_pointer, self.__eps, self.__minpts, self.__amount_clusters);
371
        
372
        else:
373
            self.__allocate_clusters();
374
            
375
            if ( (self.__amount_clusters is not None) and (self.__amount_clusters != len(self.get_clusters())) ):
376
                analyser = ordering_analyser(self.get_ordering());
377
                radius, _ = analyser.calculate_connvectivity_radius(self.__amount_clusters);
378
                if (radius is not None):
379
                    self.__eps = radius;
380
                    self.__allocate_clusters();
381
382
383
    def __initialize(self, sample):
384
        """!
385
        @brief Initializes internal states and resets clustering results in line with input sample.
386
        
387
        """
388
        
389
        self.__processed = [False] * len(sample);
390
        self.__optics_objects = [optics_descriptor(i) for i in range(len(sample))];     # List of OPTICS objects that corresponds to objects from input sample.
391
        self.__ordered_database = [];       # List of OPTICS objects in traverse order. 
392
        
393
        self.__clusters = None;     # Result of clustering (list of clusters where each cluster contains indexes of objects from input data).
394
        self.__noise = None;        # Result of clustering (noise).
395
396
397
    def __allocate_clusters(self):
398
        """!
399
        @brief Performs cluster allocation and builds ordering diagram that is based on reachability-distances.
400
        
401
        """
402
        
403
        self.__initialize(self.__sample_pointer);
404
        
405
        for optic_object in self.__optics_objects:
406
            if (optic_object.processed is False):
407
                self.__expand_cluster_order(optic_object);
408
        
409
        self.__extract_clusters();
410
    
411
    
412
    def get_clusters(self):
413
        """!
414
        @brief Returns list of allocated clusters, where each cluster contains indexes of objects and each cluster is represented by list.
415
        
416
        @return (list) List of allocated clusters.
417
        
418
        @see process()
419
        @see get_noise()
420
        @see get_ordering()
421
        @see get_radius()
422
        
423
        """
424
        
425
        return self.__clusters;
426
    
427
    
428
    def get_noise(self):
429
        """!
430
        @brief Returns list of noise that contains indexes of objects that corresponds to input data.
431
        
432
        @return (list) List of allocated noise objects.
433
        
434
        @see process()
435
        @see get_clusters()
436
        @see get_ordering()
437
        @see get_radius()
438
        
439
        """
440
        
441
        return self.__noise;
442
    
443
    
444
    def get_ordering(self):
445
        """!
446
        @brief Returns clustering ordering information about the input data set.
447
        @details Clustering ordering of data-set contains the information about the internal clustering structure in line with connectivity radius.
448
        
449
        @return (ordering_analyser) Analyser of clustering ordering.
450
        
451
        @see process()
452
        @see get_clusters()
453
        @see get_noise()
454
        @see get_radius()
455
        
456
        """
457
        
458
        if (self.__ordering is None):
459
            self.__ordering = [];
460
        
461
            for cluster in self.__clusters:
462
                for index_object in cluster:
463
                    optics_object = self.__optics_objects[index_object];
464
                    if (optics_object.reachability_distance is not None):
465
                        self.__ordering.append(optics_object.reachability_distance);
466
            
467
        return self.__ordering;
468
    
469
    
470
    def get_radius(self):
471
        """!
472
        @brief Returns connectivity radius that is calculated and used for clustering by the algorithm.
473
        @details Connectivity radius may be changed only in case of usage additional parameter of the algorithm - amount of clusters for allocation.
474
        
475
        @return (double) Connectivity radius.
476
        
477
        @see get_ordering()
478
        @see get_clusters()
479
        @see get_noise()
480
        
481
        """
482
        
483
        return self.__eps;
484
    
485
486
    def get_cluster_encoding(self):
487
        """!
488
        @brief Returns clustering result representation type that indicate how clusters are encoded.
489
        
490
        @return (type_encoding) Clustering result representation.
491
        
492
        @see get_clusters()
493
        
494
        """
495
        
496
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
497
498
499
    def __expand_cluster_order(self, optics_object):
500
        """!
501
        @brief Expand cluster order from not processed optic-object that corresponds to object from input data.
502
               Traverse procedure is performed until objects are reachable from core-objects in line with connectivity radius.
503
               Order database is updated during expanding.
504
               
505
        @param[in] optics_object (optics_descriptor): Object that hasn't been processed.
506
        
507
        """
508
        
509
        optics_object.processed = True;
510
        
511
        neighbors_descriptor = self.__neighbor_indexes(optics_object);
512
        optics_object.reachability_distance = None;
513
        
514
        self.__ordered_database.append(optics_object);
515
        
516
        # Check core distance
517
        if (len(neighbors_descriptor) >= self.__minpts):
518
            neighbors_descriptor.sort(key = lambda obj: obj[1]);
519
            optics_object.core_distance = neighbors_descriptor[self.__minpts - 1][1];
520
            
521
            # Continue processing
522
            order_seed = list();
523
            self.__update_order_seed(optics_object, neighbors_descriptor, order_seed);
524
            
525
            while(len(order_seed) > 0):
526
                optic_descriptor = order_seed[0];
527
                order_seed.remove(optic_descriptor);
528
                
529
                neighbors_descriptor = self.__neighbor_indexes(optic_descriptor);
530
                optic_descriptor.processed = True;
531
                
532
                self.__ordered_database.append(optic_descriptor);
533
                
534
                if (len(neighbors_descriptor) >= self.__minpts):
535
                    neighbors_descriptor.sort(key = lambda obj: obj[1]);
536
                    optic_descriptor.core_distance = neighbors_descriptor[self.__minpts - 1][1];
537
                    
538
                    self.__update_order_seed(optic_descriptor, neighbors_descriptor, order_seed);
539
                else:
540
                    optic_descriptor.core_distance = None;
541
                    
542
        else:
543
            optics_object.core_distance = None;
544
545
    
546
    def __extract_clusters(self):
547
        """!
548
        @brief Extract clusters and noise from order database.
549
        
550
        """
551
     
552
        self.__clusters = [];
553
        self.__noise = [];
554
555
        current_cluster = self.__noise;
556
        for optics_object in self.__ordered_database:
557
            if ((optics_object.reachability_distance is None) or (optics_object.reachability_distance > self.__eps)):
558
                if ((optics_object.core_distance is not None) and (optics_object.core_distance <= self.__eps)):
559
                    self.__clusters.append([ optics_object.index_object ]);
560
                    current_cluster = self.__clusters[-1];
561
                else:
562
                    self.__noise.append(optics_object.index_object);
563
            else:
564
                current_cluster.append(optics_object.index_object);
565
566
567
    def __update_order_seed(self, optic_descriptor, neighbors_descriptors, order_seed):
568
        """!
569
        @brief Update sorted list of reachable objects (from core-object) that should be processed using neighbors of core-object.
570
        
571
        @param[in] optic_descriptor (optics_descriptor): Core-object whose neighbors should be analysed.
572
        @param[in] neighbors_descriptors (list): List of neighbors of core-object.
573
        @param[in|out] order_seed (list): List of sorted object in line with reachable distance.
574
        
575
        """
576
        
577
        for neighbor_descriptor in neighbors_descriptors:
578
            index_neighbor = neighbor_descriptor[0];
579
            current_reachable_distance = neighbor_descriptor[1];
580
            
581
            if (self.__optics_objects[index_neighbor].processed != True):
582
                reachable_distance = max(current_reachable_distance, optic_descriptor.core_distance);
583
                if (self.__optics_objects[index_neighbor].reachability_distance is None):
584
                    self.__optics_objects[index_neighbor].reachability_distance = reachable_distance;
585
                    
586
                    # insert element in queue O(n) - worst case.
587
                    index_insertion = len(order_seed);
588
                    for index_seed in range(0, len(order_seed)):
589
                        if (reachable_distance < order_seed[index_seed].reachability_distance):
590
                            index_insertion = index_seed;
591
                            break;
592
                    
593
                    order_seed.insert(index_insertion, self.__optics_objects[index_neighbor]);
594
595
                else:
596
                    if (reachable_distance < self.__optics_objects[index_neighbor].reachability_distance):
597
                        self.__optics_objects[index_neighbor].reachability_distance = reachable_distance;
598
                        order_seed.sort(key = lambda obj: obj.reachability_distance);
599
600
601
    def __neighbor_indexes(self, optic_object):
602
        """!
603
        @brief Return list of indexes of neighbors of specified point for the data.
604
        
605
        @param[in] optic_object (optics_descriptor): Object for which neighbors should be returned in line with connectivity radius.
606
        
607
        @return (list) List of indexes of neighbors in line the connectivity radius.
608
        
609
        """
610
              
611
        neighbor_description = [];
612
        
613
        for index in range(0, len(self.__sample_pointer), 1):
614
            if (index == optic_object.index_object):
615
                continue;
616
            
617
            distance = euclidean_distance(self.__sample_pointer[optic_object.index_object], self.__sample_pointer[index]);
618
            if (distance <= self.__eps):
619
                neighbor_description.append( [index, distance] );
620
            
621
        return neighbor_description;