Completed
Push — master ( 6fa201...1acbc9 )
by Andrei
01:59
created

ordering_analyser.__init__()   A

Complexity

Conditions 1

Size

Total Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
dl 0
loc 8
rs 9.4285
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: OPTICS (Ordering Points To Identify Clustering Structure)
4
@details Based on article description:
5
         - M.Ankerst, M.Breunig, H.Kriegel, J.Sander. OPTICS: Ordering Points To Identify the Clustering Structure. 1999.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2016
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
from pyclustering.utils import euclidean_distance;
29
30
import matplotlib.pyplot as plt;
0 ignored issues
show
Configuration introduced by
The import matplotlib.pyplot could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
31
32
33
class ordering_visualizer:
34
    """!
35
    @brief Cluster ordering diagram visualizer that represents dataset graphically.
36
    
37
    """
38
    
39
    @staticmethod
40
    def show_ordering_diagram(analyser):
41
        ordering = analyser.cluster_ordering;
42
        indexes = [i for i in range(0, len(ordering))];
43
        
44
        plt.bar(indexes, ordering);
45
        plt.show();
46
47
48
class ordering_analyser:
49
    """!
50
    @brief Analyser of cluster ordering diagram.
51
    
52
    """
53
    
54
    @property
55
    def cluster_ordering(self):
56
        """!
57
        @brief (list) Returns values of dataset cluster ordering.
58
        
59
        """
60
        return self.__ordering;
61
    
62
    
63
    def __init__(self, ordering_diagram):
64
        """!
65
        @brief Analyser of ordering diagram that is based on reachability-distances.
66
        
67
        @see calculate_connvectivity_radius
68
        
69
        """
70
        self.__ordering = ordering_diagram;
71
    
72
    
73
    def __len__(self):
74
        """!
75
        @brief Returns length of clustering-ordering diagram.
76
        
77
        """
78
        return len(self.__ordering);
79
    
80
    
81
    def calculate_connvectivity_radius(self, amount_clusters):
82
        """!
83
        @brief Calculates connectivity radius of allocation specified amount of clusters using ordering diagram.
84
        
85
        @return (double) Value of connectivity radius.
86
        
87
        """
88
        
89
        maximum_distance = max(self.__ordering);
90
        
91
        upper_distance = maximum_distance;
92
        lower_distance = 0.0;
93
        
94
        radius = None;
95
        
96
        if (self.extract_cluster_amount(maximum_distance) <= amount_clusters):
97
            while(True):
98
                radius = (lower_distance + upper_distance) / 2.0;
99
                
100
                amount = self.extract_cluster_amount(radius);
101
                if (amount == amount_clusters):
102
                    break;
103
                
104
                elif (amount > amount_clusters):
105
                    lower_distance = radius;
106
                
107
                elif (amount < amount_clusters):
108
                    upper_distance = radius;
109
        
110
        return radius;
111
    
112
    
113
    def extract_cluster_amount(self, radius):
114
        """!
115
        @brief Obtains amount of clustering that can be allocated by using specified radius for ordering diagram.
116
        @details When growth of reachability-distances is detected than it is considered as a start point of cluster, 
117
                 than pick is detected and after that recession is observed until new growth (that means end of the
118
                 current cluster and start of a new one) or end of diagram.
119
        
120
        @param[in] ordering (list): list of reachability-distances of optics objects.
121
        @param[in] radius (double): connectivity radius that is used for cluster allocation.
122
        
123
        
124
        @return (unit) Amount of clusters that can be allocated by the connectivity radius on ordering diagram.
125
        
126
        """
127
        
128
        amount_clusters = 1;
129
        
130
        cluster_start = False;
131
        cluster_pick = False;
132
        previous_distance = 0.0;
133
        
134
        for distance in self.__ordering:
135
            if (distance >= radius):
136
                if (cluster_start is False):
137
                    cluster_start = True;
138
                    amount_clusters += 1;
139
                
140
                else:
141
                    if ((distance < previous_distance) and (cluster_pick is False)):
142
                        cluster_pick = True;
143
                    
144
                    elif ((distance > previous_distance) and (cluster_pick is True)):
145
                        cluster_pick = False;
146
                        amount_clusters += 1;
147
                
148
                previous_distance = distance;
149
            
150
            else:
151
                cluster_start = False;
152
                cluster_pick = False;
153
        
154
        return amount_clusters;
155
156
157
class optics_descriptor:
158
    """!
159
    @brief Object description that used by OPTICS algorithm for cluster analysis.
160
    
161
    """
162
    
163
    def __init__(self, index, core_distance = None, reachability_distance = None):
164
        """!
165
        @brief Constructor of object description in optics terms.
166
        
167
        @param[in] index (uint): Index of the object in the data set.
168
        @param[in] core_distance (double): Core distance that is minimum distance to specified number of neighbors.
169
        @param[in] reachability_distance (double): Reachability distance to this object.
170
        
171
        """
172
        
173
        ## Reachability distance - the smallest distance to be reachable by core object.
174
        self.index_object = index;
175
        
176
        ## Core distance - the smallest distance to reach specified number of neighbors that is not greater then connectivity radius.
177
        self.core_distance = core_distance;
178
        
179
        ## Index of object from the input data.
180
        self.reachability_distance = reachability_distance;
181
        
182
        ## True is object has been already traversed.
183
        self.processed = False;
184
        
185
    def __repr__(self):
186
        """!
187
        @brief Returns string representation of the optics descriptor.
188
        
189
        """
190
        
191
        return '(%s, [c: %s, r: %s])' % (self.index_object, self.core_distance, self.reachability_distance);
192
193
194
class optics:
195
    """!
196
    @brief Class represents clustering algorithm OPTICS (Ordering Points To Identify Clustering Structure).
197
    @details OPTICS is a density-based algorithm. Purpose of the algorithm is to provide explicit clusters, but create clustering-ordering representation of the input data. 
198
             Clustering-ordering information contains information about internal structures of data set in terms of density and proper connectivity radius can be obtained
199
             for allocation required amount of clusters using this diagram. In case of usage additional input parameter 'amount of clusters' connectivity radius should be
200
             bigger than real - because it will be calculated by the algorithms.
201
202
    Example:
203
    @code
204
        # Read sample for clustering from some file
205
        sample = read_sample(path_sample);
206
        
207
        # Create OPTICS algorithm for cluster analysis
208
        optics_instance = optics(sample, 0.5, 6);
209
        
210
        # Run cluster analysis
211
        optics_instance.process();
212
        
213
        # Obtain results of clustering
214
        clusters = optics_instance.get_clusters();
215
        noise = optics_instance.get_noise();
216
        
217
        # Obtain rechability-distances
218
        ordering = ordering_analyser(optics_instance.get_ordering());
219
        
220
        # Visualization of cluster ordering in line with reachability distance.
221
        ordering_visualizer.show_ordering_diagram(ordering);
222
    @endcode
223
    
224
    Amount of clusters that should be allocated can be also specified. In this case connectivity radius should be greater than real, for example:
225
    @code
226
        # Import required packages
227
        from pyclustering.cluster.optics import optics;
228
        from pyclustering.samples.definitions import FCPS_SAMPLES;
229
        from pyclustering.utils import read_sample;
230
        
231
        # Read sample for clustering from some file
232
        sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN);
233
        
234
        # Run cluster analysis where connvectivity radius is bigger than real
235
        radius = 2.0;
236
        neighbors = 3;
237
        amount_of_clusters = 3;
238
        
239
        optics_instance = optics(sample, radius, neighbors, amount_of_clusters);
240
        
241
        # Obtain results of clustering
242
        clusters = optics_instance.get_clusters();
243
        noise = optics_instance.get_noise();
244
    @endcode
245
    
246
    """
247
    
248
    def __init__(self, sample, eps, minpts, amount_clusters = None):
249
        """!
250
        @brief Constructor of clustering algorithm OPTICS.
251
        
252
        @param[in] sample (list): Input data that is presented as a list of points (objects), where each point is represented by list or tuple.
253
        @param[in] eps (double): Connectivity radius between points, points may be connected if distance between them less than the radius.
254
        @param[in] minpts (uint): Minimum number of shared neighbors that is required for establishing links between points.
255
        @param[in] amount_clusters (uint): Optional parameter where amount of clusters that should be allocated is specified.
256
                    In case of usage 'amount_clusters' connectivity radius can be greater than real, in other words, there is place for mistake
257
                    in connectivity radius usage.
258
        
259
        """
260
        
261
        self.__sample_pointer = sample;     # Algorithm parameter - pointer to sample for processing.
262
        self.__eps = eps;                   # Algorithm parameter - connectivity radius between object for establish links between object.
263
        self.__minpts = minpts;             # Algorithm parameter - minimum number of neighbors that is required for establish links between object.
264
        self.__amount_clusters = amount_clusters;
265
        self.__ordering = None;
266
        
267
        self.__initialize(sample);
268
269
270
    def process(self):
271
        """!
272
        @brief Performs cluster analysis in line with rules of OPTICS algorithm.
273
        
274
        @remark Results of clustering can be obtained using corresponding gets methods.
275
        
276
        @see get_clusters()
277
        @see get_noise()
278
        @see get_ordering()
279
        
280
        """
281
        
282
        self.__allocate_clusters();
283
        
284
        if ( (self.__amount_clusters is not None) and (self.__amount_clusters != len(self.get_clusters())) ):
285
            analyser = ordering_analyser(self.get_ordering());
286
            radius = analyser.calculate_connvectivity_radius(self.__amount_clusters);
287
            if (radius is not None):
288
                self.__eps = radius;
289
                self.__initialize(self.__sample_pointer);
290
                self.__allocate_clusters();
291
292
293
    def __initialize(self, sample):
294
        """!
295
        @brief Initializes internal states and resets clustering results in line with input sample.
296
        
297
        """
298
        
299
        self.__processed = [False] * len(sample);
300
        self.__optics_objects = [optics_descriptor(i) for i in range(len(sample))];     # List of OPTICS objects that corresponds to objects from input sample.
301
        self.__ordered_database = [];       # List of OPTICS objects in traverse order. 
302
        
303
        self.__clusters = None;     # Result of clustering (list of clusters where each cluster contains indexes of objects from input data).
304
        self.__noise = None;        # Result of clustering (noise).
305
306
307
    def __allocate_clusters(self):
308
        """!
309
        @brief Performs cluster allocation and builds ordering diagram that is based on reachability-distances.
310
        
311
        """
312
        
313
        for optic_object in self.__optics_objects:
314
            if (optic_object.processed is False):
315
                self.__expand_cluster_order(optic_object);
316
        
317
        self.__extract_clusters();
318
    
319
    
320
    def get_clusters(self):
321
        """!
322
        @brief Returns list of allocated clusters, where each cluster contains indexes of objects and each cluster is represented by list.
323
        
324
        @return (list) List of allocated clusters.
325
        
326
        @see process()
327
        @see get_noise()
328
        @see get_ordering()
329
        @see get_radius()
330
        
331
        """
332
        
333
        return self.__clusters;
334
    
335
    
336
    def get_noise(self):
337
        """!
338
        @brief Returns list of noise that contains indexes of objects that corresponds to input data.
339
        
340
        @return (list) List of allocated noise objects.
341
        
342
        @see process()
343
        @see get_clusters()
344
        @see get_ordering()
345
        @see get_radius()
346
        
347
        """
348
        
349
        return self.__noise;
350
    
351
    
352
    def get_ordering(self):
353
        """!
354
        @brief Returns clustering ordering information about the input data set.
355
        @details Clustering ordering of data-set contains the information about the internal clustering structure in line with connectivity radius.
356
        
357
        @return (ordering_analyser) Analyser of clustering ordering.
358
        
359
        @see process()
360
        @see get_clusters()
361
        @see get_noise()
362
        @see get_radius()
363
        
364
        """
365
        
366
        if (self.__ordering is None):
367
            self.__ordering = [];
368
        
369
            for cluster in self.__clusters:
370
                for index_object in cluster:
371
                    optics_object = self.__optics_objects[index_object];
372
                    if (optics_object.reachability_distance is not None):
373
                        self.__ordering.append(optics_object.reachability_distance);
374
            
375
        return self.__ordering;
376
    
377
    
378
    def get_radius(self):
379
        """!
380
        @brief Returns connectivity radius that is calculated and used for clustering by the algorithm.
381
        @details Connectivity radius may be changed only in case of usage additional parameter of the algorithm - amount of clusters for allocation.
382
        
383
        @return (double) Connectivity radius.
384
        
385
        @see get_ordering()
386
        @see get_clusters()
387
        @see get_noise()
388
        
389
        """
390
        
391
        return self.__eps;
392
    
393
    
394
    def __expand_cluster_order(self, optics_object):
395
        """!
396
        @brief Expand cluster order from not processed optic-object that corresponds to object from input data.
397
               Traverse procedure is performed until objects are reachable from core-objects in line with connectivity radius.
398
               Order database is updated during expanding.
399
               
400
        @param[in] optics_object (optics_descriptor): Object that hasn't been processed.
401
        
402
        """
403
        
404
        optics_object.processed = True;
405
        
406
        neighbors_descriptor = self.__neighbor_indexes(optics_object);
407
        optics_object.reachability_distance = None;
408
        
409
        self.__ordered_database.append(optics_object);
410
        
411
        # Check core distance
412
        if (len(neighbors_descriptor) >= self.__minpts):
413
            neighbors_descriptor.sort(key = lambda obj: obj[1]);
414
            optics_object.core_distance = neighbors_descriptor[self.__minpts - 1][1];
415
            
416
            # Continue processing
417
            order_seed = list();
418
            self.__update_order_seed(optics_object, neighbors_descriptor, order_seed);
419
            
420
            while(len(order_seed) > 0):
421
                optic_descriptor = order_seed[0];
422
                order_seed.remove(optic_descriptor);
423
                
424
                neighbors_descriptor = self.__neighbor_indexes(optic_descriptor);
425
                optic_descriptor.processed = True;
426
                
427
                self.__ordered_database.append(optic_descriptor);
428
                
429
                if (len(neighbors_descriptor) >= self.__minpts):
430
                    neighbors_descriptor.sort(key = lambda obj: obj[1]);
431
                    optic_descriptor.core_distance = neighbors_descriptor[self.__minpts - 1][1];
432
                    
433
                    self.__update_order_seed(optic_descriptor, neighbors_descriptor, order_seed);
434
                else:
435
                    optic_descriptor.core_distance = None;
436
                    
437
        else:
438
            optics_object.core_distance = None;
439
440
    
441
    def __extract_clusters(self):
442
        """!
443
        @brief Extract clusters and noise from order database.
444
        
445
        """
446
     
447
        self.__clusters = [];
448
        self.__noise = [];
449
450
        current_cluster = [];
451
        for optics_object in self.__ordered_database:
452
            if ((optics_object.reachability_distance is None) or (optics_object.reachability_distance > self.__eps)):
453
                if ((optics_object.core_distance is not None) and (optics_object.core_distance <= self.__eps)):
454
                    if (len(current_cluster) > 0):
455
                        self.__clusters.append(current_cluster);
456
                        current_cluster = [];
457
                        
458
                    current_cluster.append(optics_object.index_object);
459
                else:
460
                    self.__noise.append(optics_object.index_object);
461
            else:
462
                current_cluster.append(optics_object.index_object);
463
        
464
        if (len(current_cluster) > 0):
465
            self.__clusters.append(current_cluster);
466
                
467
468
    def __update_order_seed(self, optic_descriptor, neighbors_descriptors, order_seed):
469
        """!
470
        @brief Update sorted list of reachable objects (from core-object) that should be processed using neighbors of core-object.
471
        
472
        @param[in] optic_descriptor (optics_descriptor): Core-object whose neighbors should be analysed.
473
        @param[in] neighbors_descriptors (list): List of neighbors of core-object.
474
        @param[in|out] order_seed (list): List of sorted object in line with reachable distance.
475
        
476
        """
477
        
478
        for neighbor_descriptor in neighbors_descriptors:
479
            index_neighbor = neighbor_descriptor[0];
480
            current_reachable_distance = neighbor_descriptor[1];
481
            
482
            if (self.__optics_objects[index_neighbor].processed != True):
483
                reachable_distance = max(current_reachable_distance, optic_descriptor.core_distance);
484
                if (self.__optics_objects[index_neighbor].reachability_distance is None):
485
                    self.__optics_objects[index_neighbor].reachability_distance = reachable_distance;
486
                    
487
                    # insert element in queue O(n) - worst case.
488
                    index_insertion = len(order_seed);
489
                    for index_seed in range(0, len(order_seed)):
490
                        if (reachable_distance < order_seed[index_seed].reachability_distance):
491
                            index_insertion = index_seed;
492
                            break;
493
                    
494
                    order_seed.insert(index_insertion, self.__optics_objects[index_neighbor]);
495
496
                else:
497
                    if (reachable_distance < self.__optics_objects[index_neighbor].reachability_distance):
498
                        self.__optics_objects[index_neighbor].reachability_distance = reachable_distance;
499
                        order_seed.sort(key = lambda obj: obj.reachability_distance);
500
501
502
    def __neighbor_indexes(self, optic_object):
503
        """!
504
        @brief Return list of indexes of neighbors of specified point for the data.
505
        
506
        @param[in] optic_object (optics_descriptor): Object for which neighbors should be returned in line with connectivity radius.
507
        
508
        @return (list) List of indexes of neighbors in line the connectivity radius.
509
        
510
        """
511
              
512
        neighbor_description = [];
513
        
514
        for index in range(0, len(self.__sample_pointer), 1):
515
            if (index == optic_object.index_object):
516
                continue;
517
            
518
            distance = euclidean_distance(self.__sample_pointer[optic_object.index_object], self.__sample_pointer[index]);
519
            if (distance <= self.__eps):
520
                neighbor_description.append( [index, distance] );
521
            
522
        return neighbor_description;