Completed
Push — master ( 08f56f...b517ee )
by Andrei
01:35
created

optics.process()   A

Complexity

Conditions 4

Size

Total Lines 20

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
dl 0
loc 20
rs 9.2
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: OPTICS (Ordering Points To Identify Clustering Structure)
4
@details Based on article description:
5
         - M.Ankerst, M.Breunig, H.Kriegel, J.Sander. OPTICS: Ordering Points To Identify the Clustering Structure. 1999.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2016
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
from pyclustering.utils import euclidean_distance;
29
30
31
class optics_descriptor:
32
    """!
33
    @brief Object description that used by OPTICS algorithm for cluster analysis.
34
    
35
    """
36
    
37
    def __init__(self, index, core_distance = None, reachability_distance = None):
38
        """!
39
        @brief Constructor of object description in optics terms.
40
        
41
        @param[in] index (uint): Index of the object in the data set.
42
        @param[in] core_distance (double): Core distance that is minimum distance to specified number of neighbors.
43
        @param[in] reachability_distance (double): Reachability distance to this object.
44
        
45
        """
46
        
47
        ## Reachability distance - the smallest distance to be reachable by core object.
48
        self.index_object = index;
49
        
50
        ## Core distance - the smallest distance to reach specified number of neighbors that is not greater then connectivity radius.
51
        self.core_distance = core_distance;
52
        
53
        ## Index of object from the input data.
54
        self.reachability_distance = reachability_distance;
55
        
56
        ## True is object has been already traversed.
57
        self.processed = False;
58
        
59
    def __repr__(self):
60
        """!
61
        @brief Returns string representation of the optics descriptor.
62
        
63
        """
64
        
65
        return '(%s, [c: %s, r: %s])' % (self.index_object, self.core_distance, self.reachability_distance);
66
67
68
class optics:
69
    """!
70
    @brief Class represents clustering algorithm OPTICS (Ordering Points To Identify Clustering Structure).
71
    @details OPTICS is a density-based algorithm. Purpose of the algorithm is to provide explicit clusters, but create clustering-ordering representation of the input data. 
72
             Clustering-ordering information contains information about internal structures of data set in terms of density and proper connectivity radius can be obtained
73
             for allocation required amount of clusters using this diagram. In case of usage additional input parameter 'amount of clusters' connectivity radius should be
74
             bigger than real - because it will be calculated by the algorithms.
75
76
    Example:
77
    @code
78
        # Read sample for clustering from some file
79
        sample = read_sample(path_sample);
80
        
81
        # Create OPTICS algorithm for cluster analysis
82
        optics_instance = optics(sample, 0.5, 6);
83
        
84
        # Run cluster analysis
85
        optics_instance.process();
86
        
87
        # Obtain results of clustering
88
        clusters = optics_instance.get_clusters();
89
        noise = optics_instance.get_noise();
90
        
91
        # Obtain rechability-distances
92
        ordering = optics_instance.get_cluster_ordering();
93
        
94
        # Visualization of cluster ordering in line with reachability distance.
95
        indexes = [i for i in range(0, len(ordering))];
96
        plt.bar(indexes, ordering);
97
        plt.show(); 
98
    @endcode
99
    
100
    Amount of clusters that should be allocated can be also specified. In this case connectivity radius should be greater than real, for example:
101
    @code
102
        # Import required packages
103
        from pyclustering.cluster.optics import optics;
104
        from pyclustering.samples.definitions import FCPS_SAMPLES;
105
        from pyclustering.utils import read_sample;
106
        
107
        # Read sample for clustering from some file
108
        sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN);
109
        
110
        # Run cluster analysis where connvectivity radius is bigger than real
111
        radius = 2.0;
112
        neighbors = 3;
113
        amount_of_clusters = 3;
114
        
115
        optics_instance = optics(sample, radius, neighbors, amount_of_clusters);
116
        
117
        # Obtain results of clustering
118
        clusters = optics_instance.get_clusters();
119
        noise = optics_instance.get_noise();
120
    @endcode
121
    
122
    """
123
    
124
    def __init__(self, sample, eps, minpts, amount_clusters = None):
125
        """!
126
        @brief Constructor of clustering algorithm OPTICS.
127
        
128
        @param[in] sample (list): Input data that is presented as a list of points (objects), where each point is represented by list or tuple.
129
        @param[in] eps (double): Connectivity radius between points, points may be connected if distance between them less than the radius.
130
        @param[in] minpts (uint): Minimum number of shared neighbors that is required for establishing links between points.
131
        @param[in] amount_clusters (uint): Optional parameter where amount of clusters that should be allocated is specified.
132
                    In case of usage 'amount_clusters' connectivity radius can be greater than real, in other words, there is place for mistake
133
                    in connectivity radius usage.
134
        
135
        """
136
        
137
        self.__sample_pointer = sample;     # Algorithm parameter - pointer to sample for processing.
138
        self.__eps = eps;                   # Algorithm parameter - connectivity radius between object for establish links between object.
139
        self.__minpts = minpts;             # Algorithm parameter - minimum number of neighbors that is required for establish links between object.
140
        self.__amount_clusters = amount_clusters;
141
        
142
        self.__initialize(sample);
143
144
145
    def process(self):
146
        """!
147
        @brief Performs cluster analysis in line with rules of OPTICS algorithm.
148
        
149
        @remark Results of clustering can be obtained using corresponding gets methods.
150
        
151
        @see get_clusters()
152
        @see get_noise()
153
        @see get_cluster_ordering()
154
        
155
        """
156
        
157
        self.__allocate_clusters();
158
        
159
        if ( (self.__amount_clusters is not None) and (self.__amount_clusters != len(self.get_clusters())) ):
160
            radius = self.__calculate_connectivity_radius();
161
            if (radius is not None):
162
                self.__eps = radius;
163
                self.__initialize(self.__sample_pointer);
164
                self.__allocate_clusters();
165
166
167
    def __initialize(self, sample):
168
        """!
169
        @brief Initializes internal states and resets clustering results in line with input sample.
170
        
171
        """
172
        
173
        self.__processed = [False] * len(sample);
174
        self.__optics_objects = [optics_descriptor(i) for i in range(len(sample))];     # List of OPTICS objects that corresponds to objects from input sample.
175
        self.__ordered_database = [];       # List of OPTICS objects in traverse order. 
176
        
177
        self.__clusters = None;     # Result of clustering (list of clusters where each cluster contains indexes of objects from input data).
178
        self.__noise = None;        # Result of clustering (noise).
179
180
181
    def __allocate_clusters(self):
182
        """!
183
        @brief Performs cluster allocation and builds ordering diagram that is based on reachability-distances.
184
        
185
        """
186
        
187
        for optic_object in self.__optics_objects:
188
            if (optic_object.processed is False):
189
                self.__expand_cluster_order(optic_object);
190
        
191
        self.__extract_clusters();
192
    
193
    
194
    def __calculate_connectivity_radius(self):
195
        """!
196
        @brief Calculates connectivity radius of allocation specified amount of clusters using ordering diagram.
197
        
198
        @return (double) Value of connectivity radius.
199
        
200
        """
201
        
202
        ordering = self.get_cluster_ordering();
203
        
204
        maximum_distance = max(ordering);
205
        
206
        upper_distance = maximum_distance;
207
        lower_distance = 0;
208
        
209
        radius = None;
210
        
211
        if (self.__get_amount_clusters_by_ordering(ordering, maximum_distance) <= self.__amount_clusters):
212
            while(True):
213
                radius = (lower_distance + upper_distance) / 2.0;
214
                
215
                amount_clusters = self.__get_amount_clusters_by_ordering(ordering, radius);
216
                if (amount_clusters == self.__amount_clusters):
217
                    break;
218
                
219
                elif (amount_clusters > self.__amount_clusters):
220
                    lower_distance = radius;
221
                
222
                elif (amount_clusters < self.__amount_clusters):
223
                    upper_distance = radius;
224
        
225
        return radius;
226
    
227
    
228
    def __get_amount_clusters_by_ordering(self, ordering, radius):
229
        """!
230
        @brief Obtains amount of clustering that can be allocated by using specified radius for ordering diagram.
231
        @details When growth of reachability-distance  is detected than it is considered as a start point of cluster, 
232
                 than pick is detected and after that recession is observed until new growth (that means end of the
233
                 current cluster and start of a new one) or end of diagram.
234
        
235
        @param[in] ordering (list): list of reachability-distances of optics objects.
236
        @param[in] radius (double): connectivity radius that is used for cluster allocation.
237
        
238
        
239
        @return (unit) Amount of clusters that can be allocated by the connectivity radius on ordering diagram.
240
        
241
        """
242
        
243
        amount_clusters = 1;
244
        
245
        cluster_start = False;
246
        cluster_pick = False;
247
        previous_distance = 0;
248
        
249
        for distance in ordering:
250
            if (distance >= radius):
251
                if (cluster_start is False):
252
                    amount_clusters += 1;
253
                    cluster_start = True;
254
                
255
                elif ( (cluster_start is True) and (distance < previous_distance) and (cluster_pick is False) ):
256
                    cluster_pick = True;
257
                
258
                elif ( (cluster_start is True) and (distance > previous_distance) and (cluster_pick is True) ):
259
                    cluster_start = True;
260
                    cluster_pick = False;
261
                    
262
                    amount_clusters += 1;
263
                
264
                previous_distance = distance;
265
            
266
            else:
267
                cluster_start = False;
268
                cluster_pick = False;
269
        
270
        return amount_clusters;
271
    
272
    
273
    def get_clusters(self):
274
        """!
275
        @brief Returns list of allocated clusters, where each cluster contains indexes of objects and each cluster is represented by list.
276
        
277
        @return (list) List of allocated clusters.
278
        
279
        @see process()
280
        @see get_noise()
281
        @see get_cluster_ordering()
282
        
283
        """
284
        
285
        return self.__clusters;
286
    
287
    
288
    def get_noise(self):
289
        """!
290
        @brief Returns list of noise that contains indexes of objects that corresponds to input data.
291
        
292
        @return (list) List of allocated noise objects.
293
        
294
        @see process()
295
        @see get_clusters()
296
        @see get_cluster_ordering()
297
        
298
        """
299
        
300
        return self.__noise;
301
    
302
    
303
    def get_cluster_ordering(self):
304
        """!
305
        @brief Returns clustering ordering information about the input data set.
306
        @details Clustering ordering of data-set contains the information about the internal clustering structure in line with connectivity radius.
307
        
308
        @return (list) List of reachability distances (clustering ordering).
309
        
310
        @see process()
311
        @see get_clusters()
312
        @see get_noise()
313
        
314
        """
315
        
316
        ordering = [];
317
        
318
        for cluster in self.__clusters:
319
            for index_object in cluster:
320
                optics_object = self.__optics_objects[index_object];
321
                if (optics_object.reachability_distance is not None):
322
                    ordering.append(optics_object.reachability_distance);
323
        
324
        return ordering;
325
    
326
    
327
    def __expand_cluster_order(self, optics_object):
328
        """!
329
        @brief Expand cluster order from not processed optic-object that corresponds to object from input data.
330
               Traverse procedure is performed until objects are reachable from core-objects in line with connectivity radius.
331
               Order database is updated during expanding.
332
               
333
        @param[in] optics_object (optics_descriptor): Object that hasn't been processed.
334
        
335
        """
336
        
337
        optics_object.processed = True;
338
        
339
        neighbors_descriptor = self.__neighbor_indexes(optics_object);
340
        optics_object.reachability_distance = None;
341
        
342
        self.__ordered_database.append(optics_object);
343
        
344
        # Check core distance
345
        if (len(neighbors_descriptor) >= self.__minpts):
346
            neighbors_descriptor.sort(key = lambda obj: obj[1]);
347
            optics_object.core_distance = neighbors_descriptor[self.__minpts - 1][1];
348
            
349
            # Continue processing
350
            order_seed = list();
351
            self.__update_order_seed(optics_object, neighbors_descriptor, order_seed);
352
            
353
            while(len(order_seed) > 0):
354
                optic_descriptor = order_seed[0];
355
                order_seed.remove(optic_descriptor);
356
                
357
                neighbors_descriptor = self.__neighbor_indexes(optic_descriptor);
358
                optic_descriptor.processed = True;
359
                
360
                self.__ordered_database.append(optic_descriptor);
361
                
362
                if (len(neighbors_descriptor) >= self.__minpts):
363
                    neighbors_descriptor.sort(key = lambda obj: obj[1]);
364
                    optic_descriptor.core_distance = neighbors_descriptor[self.__minpts - 1][1];
365
                    
366
                    self.__update_order_seed(optic_descriptor, neighbors_descriptor, order_seed);
367
                else:
368
                    optic_descriptor.core_distance = None;
369
                    
370
        else:
371
            optics_object.core_distance = None;
372
373
    
374
    def __extract_clusters(self):
375
        """!
376
        @brief Extract clusters and noise from order database.
377
        
378
        """
379
     
380
        self.__clusters = [];
381
        self.__noise = [];
382
383
        current_cluster = [];
384
        for optics_object in self.__ordered_database:
385
            if ((optics_object.reachability_distance is None) or (optics_object.reachability_distance > self.__eps)):
386
                if ((optics_object.core_distance is not None) and (optics_object.core_distance <= self.__eps)):
387
                    if (len(current_cluster) > 0):
388
                        self.__clusters.append(current_cluster);
389
                        current_cluster = [];
390
                        
391
                    current_cluster.append(optics_object.index_object);
392
                else:
393
                    self.__noise.append(optics_object.index_object);
394
            else:
395
                current_cluster.append(optics_object.index_object);
396
        
397
        if (len(current_cluster) > 0):
398
            self.__clusters.append(current_cluster);
399
                
400
401
    def __update_order_seed(self, optic_descriptor, neighbors_descriptors, order_seed):
402
        """!
403
        @brief Update sorted list of reachable objects (from core-object) that should be processed using neighbors of core-object.
404
        
405
        @param[in] optic_descriptor (optics_descriptor): Core-object whose neighbors should be analysed.
406
        @param[in] neighbors_descriptors (list): List of neighbors of core-object.
407
        @param[in|out] order_seed (list): List of sorted object in line with reachable distance.
408
        
409
        """
410
        
411
        for neighbor_descriptor in neighbors_descriptors:
412
            index_neighbor = neighbor_descriptor[0];
413
            current_reachable_distance = neighbor_descriptor[1];
414
            
415
            if (self.__optics_objects[index_neighbor].processed != True):
416
                reachable_distance = max(current_reachable_distance, optic_descriptor.core_distance);
417
                if (self.__optics_objects[index_neighbor].reachability_distance is None):
418
                    self.__optics_objects[index_neighbor].reachability_distance = reachable_distance;
419
                    
420
                    # insert element in queue O(n) - worst case.
421
                    index_insertion = len(order_seed);
422
                    for index_seed in range(0, len(order_seed)):
423
                        if (reachable_distance < order_seed[index_seed].reachability_distance):
424
                            index_insertion = index_seed;
425
                            break;
426
                    
427
                    order_seed.insert(index_insertion, self.__optics_objects[index_neighbor]);
428
429
                else:
430
                    if (reachable_distance < self.__optics_objects[index_neighbor].reachability_distance):
431
                        self.__optics_objects[index_neighbor].reachability_distance = reachable_distance;
432
                        order_seed.sort(key = lambda obj: obj.reachability_distance);
433
434
435
    def __neighbor_indexes(self, optic_object):
436
        """!
437
        @brief Return list of indexes of neighbors of specified point for the data.
438
        
439
        @param[in] optic_object (optics_descriptor): Object for which neighbors should be returned in line with connectivity radius.
440
        
441
        @return (list) List of indexes of neighbors in line the connectivity radius.
442
        
443
        """
444
              
445
        neighbor_description = [];
446
        
447
        for index in range(0, len(self.__sample_pointer), 1):
448
            if (index == optic_object.index_object):
449
                continue;
450
            
451
            distance = euclidean_distance(self.__sample_pointer[optic_object.index_object], self.__sample_pointer[index]);
452
            if (distance <= self.__eps):
453
                neighbor_description.append( [index, distance] );
454
            
455
        return neighbor_description;