Completed
Push — master ( 1df53c...08f56f )
by Andrei
02:02
created

optics.__neighbor_indexes()   A

Complexity

Conditions 4

Size

Total Lines 21

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
dl 0
loc 21
rs 9.0534
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: OPTICS (Ordering Points To Identify Clustering Structure)
4
@details Based on article description:
5
         - M.Ankerst, M.Breunig, H.Kriegel, J.Sander. OPTICS: Ordering Points To Identify the Clustering Structure. 1999.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2016
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
from pyclustering.utils import euclidean_distance;
29
30
31
class optics_descriptor:
32
    """!
33
    @brief Object description that used by OPTICS algorithm for cluster analysis.
34
    
35
    """
36
    
37
    def __init__(self, index, core_distance = None, reachability_distance = None):
38
        """!
39
        @brief Constructor of object description in optics terms.
40
        
41
        @param[in] index (uint): Index of the object in the data set.
42
        @param[in] core_distance (double): Core distance that is minimum distance to specified number of neighbors.
43
        @param[in] reachability_distance (double): Reachability distance to this object.
44
        
45
        """
46
        
47
        ## Reachability distance - the smallest distance to be reachable by core object.
48
        self.index_object = index;
49
        
50
        ## Core distance - the smallest distance to reach specified number of neighbors that is not greater then connectivity radius.
51
        self.core_distance = core_distance;
52
        
53
        ## Index of object from the input data.
54
        self.reachability_distance = reachability_distance;
55
        
56
        ## True is object has been already traversed.
57
        self.processed = False;
58
        
59
    def __repr__(self):
60
        """!
61
        @brief Returns string representation of the optics descriptor.
62
        
63
        """
64
        
65
        return '(%s, [c: %s, r: %s])' % (self.index_object, self.core_distance, self.reachability_distance);               
66
67
68
class optics:
69
    """!
70
    @brief Class represents clustering algorithm OPTICS (Ordering Points To Identify Clustering Structure).
71
    @details OPTICS is a density-based algorithm. Purpose of the algorithm is to provide explicit clusters, but create clustering-ordering representation of the input data. 
72
             Clustering-ordering information contains information about internal structures of data set in terms of density. 
73
74
    Example:
75
    @code
76
        # Read sample for clustering from some file
77
        sample = read_sample(path_sample);
78
        
79
        # Create OPTICS algorithm for cluster analysis
80
        optics_instance = optics(sample, 0.5, 6);
81
        
82
        # Run cluster analysis
83
        optics_instance.process();
84
        
85
        # Obtain results of clustering
86
        clusters = optics_instance.get_clusters();
87
        noise = optics_instance.get_noise();
88
        
89
        # Obtain rechability-distances
90
        ordering = optics_instance.get_cluster_ordering();
91
        
92
        # Visualization of cluster ordering in line with reachability distance.
93
        indexes = [i for i in range(0, len(ordering))];
94
        plt.bar(indexes, ordering);
95
        plt.show(); 
96
    @endcode
97
       
98
    """
99
    
100
    def __init__(self, sample, eps, minpts, amount_clusters = None):
101
        """!
102
        @brief Constructor of clustering algorithm OPTICS.
103
        
104
        @param[in] sample (list): Input data that is presented as a list of points (objects), where each point is represented by list or tuple.
105
        @param[in] eps (double): Connectivity radius between points, points may be connected if distance between them less than the radius.
106
        @param[in] minpts (uint): Minimum number of shared neighbors that is required for establishing links between points.
107
        @param[in] amount_clusters (uint): Optional parameter where amount of clusters that should be allocated is specified.
108
                    In case of usage 'amount_clusters' connectivity radius can be greater than real, in other words, there is place for mistake
109
                    in connectivity radius usage.
110
        
111
        """
112
        
113
        self.__processed = [False] * len(sample);
114
        self.__optics_objects = [optics_descriptor(i) for i in range(len(sample))];     # List of OPTICS objects that corresponds to objects from input sample.
115
        self.__ordered_database = [];       # List of OPTICS objects in traverse order. 
116
        
117
        self.__sample_pointer = sample;     # Algorithm parameter - pointer to sample for processing.
118
        self.__eps = eps;                   # Algorithm parameter - connectivity radius between object for establish links between object.
119
        self.__minpts = minpts;             # Algorithm parameter - minimum number of neighbors that is required for establish links between object.
120
        self.__amount_clusters = amount_clusters;
121
        
122
        self.__clusters = None;             # Result of clustering (list of clusters where each cluster contains indexes of objects from input data).
123
        self.__noise = None;                # Result of clustering (noise).
124
125
126
    def process(self):
127
        """!
128
        @brief Performs cluster analysis in line with rules of OPTICS algorithm.
129
        
130
        @remark Results of clustering can be obtained using corresponding gets methods.
131
        
132
        @see get_clusters()
133
        @see get_noise()
134
        @see get_cluster_ordering()
135
        
136
        """
137
        
138
        self.__allocate_clusters();
139
        
140
        if ( (self.__amount_clusters is not None) and (self.__amount_clusters != len(self.get_clusters())) ):
141
            radius = self.__calculate_connectivity_radius();
142
            if (radius is not None):
143
                self.__eps = radius;
144
                self.__initialize(self.__sample_pointer);
145
                self.__allocate_clusters();
146
147
148
    def __initialize(self, sample):
149
        """!
150
        @brief Initializes internal states and resets clustering results in line with input sample.
151
        
152
        """
153
        
154
        self.__processed = [False] * len(sample);
155
        self.__optics_objects = [optics_descriptor(i) for i in range(len(sample))];     # List of OPTICS objects that corresponds to objects from input sample.
156
        self.__ordered_database = [];       # List of OPTICS objects in traverse order. 
157
        
158
        self.__clusters = None;
159
        self.__noise = None;
160
161
162
    def __allocate_clusters(self):
163
        """!
164
        @brief Performs cluster allocation and builds ordering diagram that is based on reachability-distances.
165
        
166
        """
167
        
168
        for optic_object in self.__optics_objects:
169
            if (optic_object.processed is False):
170
                self.__expand_cluster_order(optic_object);
171
        
172
        self.__extract_clusters();
173
    
174
    
175
    def __calculate_connectivity_radius(self):
176
        """!
177
        @brief Calculates connectivity radius of allocation specified amount of clusters using ordering diagram.
178
        
179
        @return (double) Value of connectivity radius.
180
        
181
        """
182
        
183
        ordering = self.get_cluster_ordering();
184
        
185
        maximum_distance = max(ordering);
186
        
187
        upper_distance = maximum_distance;
188
        lower_distance = 0;
189
        
190
        radius = None;
191
        
192
        if (self.__get_amount_clusters_by_ordering(ordering, maximum_distance) <= self.__amount_clusters):
193
            while(True):
194
                radius = (lower_distance + upper_distance) / 2.0;
195
                
196
                amount_clusters = self.__get_amount_clusters_by_ordering(ordering, radius);
197
                if (amount_clusters == self.__amount_clusters):
198
                    break;
199
                
200
                elif (amount_clusters > self.__amount_clusters):
201
                    lower_distance = radius;
202
                
203
                elif (amount_clusters < self.__amount_clusters):
204
                    upper_distance = radius;
205
            
206
        return radius;
207
    
208
    
209
    def __get_amount_clusters_by_ordering(self, ordering, radius):
210
        """!
211
        @brief Obtains amount of clustering that can be allocated by using specified radius for ordering diagram.
212
        @details When growth of reachability-distance  is detected than it is considered as a start point of cluster, 
213
                 than pick is detected and after that recession is observed until new growth (that means end of the
214
                 current cluster and start of a new one) or end of diagram.
215
        
216
        @param[in] ordering (list): list of reachability-distances of optics objects.
217
        @param[in] radius (double): connectivity radius that is used for cluster allocation.
218
        
219
        
220
        @return (unit) Amount of clusters that can be allocated by the connectivity radius on ordering diagram.
221
        
222
        """
223
        
224
        amount_clusters = 1;
225
        
226
        cluster_start = False;
227
        cluster_pick = False;
228
        previous_distance = 0;
229
        
230
        for distance in ordering:
231
            if (distance >= radius):
232
                if (cluster_start is False):
233
                    amount_clusters += 1;
234
                    cluster_start = True;
235
                
236
                elif ( (cluster_start is True) and (distance < previous_distance) and (cluster_pick is False) ):
237
                    cluster_pick = True;
238
                
239
                elif ( (cluster_start is True) and (distance > previous_distance) and (cluster_pick is True) ):
240
                    cluster_start = True;
241
                    cluster_pick = False;
242
                    
243
                    amount_clusters += 1;
244
                
245
                previous_distance = distance;
246
            
247
            else:
248
                cluster_start = False;
249
                cluster_pick = False;
250
        
251
        return amount_clusters;
252
    
253
    
254
    def get_clusters(self):
255
        """!
256
        @brief Returns list of allocated clusters, where each cluster contains indexes of objects and each cluster is represented by list.
257
        
258
        @return (list) List of allocated clusters.
259
        
260
        @see process()
261
        @see get_noise()
262
        @see get_cluster_ordering()
263
        
264
        """
265
        
266
        return self.__clusters;
267
    
268
    
269
    def get_noise(self):
270
        """!
271
        @brief Returns list of noise that contains indexes of objects that corresponds to input data.
272
        
273
        @return (list) List of allocated noise objects.
274
        
275
        @see process()
276
        @see get_clusters()
277
        @see get_cluster_ordering()
278
        
279
        """
280
        
281
        return self.__noise;
282
    
283
    
284
    def get_cluster_ordering(self):
285
        """!
286
        @brief Returns clustering ordering information about the input data set.
287
        @details Clustering ordering of data-set contains the information about the internal clustering structure in line with connectivity radius.
288
        
289
        @return (list) List of reachability distances (clustering ordering).
290
        
291
        @see process()
292
        @see get_clusters()
293
        @see get_noise()
294
        
295
        """
296
        
297
        ordering = [];
298
        
299
        for cluster in self.__clusters:
300
            for index_object in cluster:
301
                optics_object = self.__optics_objects[index_object];
302
                if (optics_object.reachability_distance is not None):
303
                    ordering.append(optics_object.reachability_distance);
304
        
305
        return ordering;
306
    
307
    
308
    def __expand_cluster_order(self, optics_object):
309
        """!
310
        @brief Expand cluster order from not processed optic-object that corresponds to object from input data.
311
               Traverse procedure is performed until objects are reachable from core-objects in line with connectivity radius.
312
               Order database is updated during expanding.
313
               
314
        @param[in] optics_object (optics_descriptor): Object that hasn't been processed.
315
        
316
        """
317
        
318
        optics_object.processed = True;
319
        
320
        neighbors_descriptor = self.__neighbor_indexes(optics_object);
321
        optics_object.reachability_distance = None;
322
        
323
        self.__ordered_database.append(optics_object);
324
        
325
        # Check core distance
326
        if (len(neighbors_descriptor) >= self.__minpts):
327
            neighbors_descriptor.sort(key = lambda obj: obj[1]);
328
            optics_object.core_distance = neighbors_descriptor[self.__minpts - 1][1];
329
            
330
            # Continue processing
331
            order_seed = list();
332
            self.__update_order_seed(optics_object, neighbors_descriptor, order_seed);
333
            
334
            while(len(order_seed) > 0):
335
                optic_descriptor = order_seed[0];
336
                order_seed.remove(optic_descriptor);
337
                
338
                neighbors_descriptor = self.__neighbor_indexes(optic_descriptor);
339
                optic_descriptor.processed = True;
340
                
341
                self.__ordered_database.append(optic_descriptor);
342
                
343
                if (len(neighbors_descriptor) >= self.__minpts):
344
                    neighbors_descriptor.sort(key = lambda obj: obj[1]);
345
                    optic_descriptor.core_distance = neighbors_descriptor[self.__minpts - 1][1];
346
                    
347
                    self.__update_order_seed(optic_descriptor, neighbors_descriptor, order_seed);
348
                else:
349
                    optic_descriptor.core_distance = None;
350
                    
351
        else:
352
            optics_object.core_distance = None;
353
354
    
355
    def __extract_clusters(self):
356
        """!
357
        @brief Extract clusters and noise from order database.
358
        
359
        """
360
     
361
        self.__clusters = [];
362
        self.__noise = [];
363
364
        current_cluster = [];
365
        for optics_object in self.__ordered_database:
366
            if ((optics_object.reachability_distance is None) or (optics_object.reachability_distance > self.__eps)):
367
                if ((optics_object.core_distance is not None) and (optics_object.core_distance <= self.__eps)):
368
                    if (len(current_cluster) > 0):
369
                        self.__clusters.append(current_cluster);
370
                        current_cluster = [];
371
                        
372
                    current_cluster.append(optics_object.index_object);
373
                else:
374
                    self.__noise.append(optics_object.index_object);
375
            else:
376
                current_cluster.append(optics_object.index_object);
377
        
378
        if (len(current_cluster) > 0):
379
            self.__clusters.append(current_cluster);
380
                
381
382
    def __update_order_seed(self, optic_descriptor, neighbors_descriptors, order_seed):
383
        """!
384
        @brief Update sorted list of reachable objects (from core-object) that should be processed using neighbors of core-object.
385
        
386
        @param[in] optic_descriptor (optics_descriptor): Core-object whose neighbors should be analysed.
387
        @param[in] neighbors_descriptors (list): List of neighbors of core-object.
388
        @param[in|out] order_seed (list): List of sorted object in line with reachable distance.
389
        
390
        """
391
        
392
        for neighbor_descriptor in neighbors_descriptors:
393
            index_neighbor = neighbor_descriptor[0];
394
            current_reachable_distance = neighbor_descriptor[1];
395
            
396
            if (self.__optics_objects[index_neighbor].processed != True):
397
                reachable_distance = max(current_reachable_distance, optic_descriptor.core_distance);
398
                if (self.__optics_objects[index_neighbor].reachability_distance is None):
399
                    self.__optics_objects[index_neighbor].reachability_distance = reachable_distance;
400
                    
401
                    # insert element in queue O(n) - worst case.
402
                    index_insertion = len(order_seed);
403
                    for index_seed in range(0, len(order_seed)):
404
                        if (reachable_distance < order_seed[index_seed].reachability_distance):
405
                            index_insertion = index_seed;
406
                            break;
407
                    
408
                    order_seed.insert(index_insertion, self.__optics_objects[index_neighbor]);
409
410
                else:
411
                    if (reachable_distance < self.__optics_objects[index_neighbor].reachability_distance):
412
                        self.__optics_objects[index_neighbor].reachability_distance = reachable_distance;
413
                        order_seed.sort(key = lambda obj: obj.reachability_distance);
414
415
416
    def __neighbor_indexes(self, optic_object):
417
        """!
418
        @brief Return list of indexes of neighbors of specified point for the data.
419
        
420
        @param[in] optic_object (optics_descriptor): Object for which neighbors should be returned in line with connectivity radius.
421
        
422
        @return (list) List of indexes of neighbors in line the connectivity radius.
423
        
424
        """
425
              
426
        neighbor_description = [];
427
        
428
        for index in range(0, len(self.__sample_pointer), 1):
429
            if (index == optic_object.index_object):
430
                continue;
431
            
432
            distance = euclidean_distance(self.__sample_pointer[optic_object.index_object], self.__sample_pointer[index]);
433
            if (distance <= self.__eps):
434
                neighbor_description.append( [index, distance] );
435
            
436
        return neighbor_description;