Completed
Push — 0.7.dev ( e8eda8...b87d98 )
by Andrei
58s
created

agglomerative.__calculate_farthest_distance()   A

Complexity

Conditions 4

Size

Total Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
dl 0
loc 19
rs 9.2
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: agglomerative algorithm.
4
@details Implementation based on book:
5
         - K.Anil, J.C.Dubes, R.C.Dubes. Algorithms for Clustering Data. 1988.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2017
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
from enum import IntEnum;
30
31
from pyclustering.cluster.encoder import type_encoding;
32
33
from pyclustering.utils import euclidean_distance_sqrt;
34
35
import pyclustering.core.agglomerative_wrapper as wrapper;
36
37
38
class type_link(IntEnum):
39
    """!
40
    @brief Enumerator of types of link between clusters.
41
    
42
    """
43
44
    ## The nearest objects in clusters is considered as a link.
45
    SINGLE_LINK = 0;
46
    
47
    ## The farthest objects in clusters is considered as a link.
48
    COMPLETE_LINK = 1;
49
    
50
    ## Average distance between objects in clusters is considered as a link.
51
    AVERAGE_LINK = 2;
52
    
53
    ## Distance between centers of clusters is considered as a link.
54
    CENTROID_LINK = 3;
55
56
57
class agglomerative:
58
    """!
59
    @brief Class represents agglomerative algorithm for cluster analysis.
60
    @details Agglomerative algorithm considers each data point (object) as a separate cluster at the beggining and
61
              step by step finds the best pair of clusters for merge until required amount of clusters is obtained.
62
    
63
    Example of agglomerative algorithm where centroid link is used:
64
    @code
65
        # sample for cluster analysis (represented by list)
66
        sample = read_sample(path_to_sample);
67
        
68
        # create object that uses python code only
69
        agglomerative_instance = agglomerative(sample, 2, link_type.CENTROID_LINK)
70
        
71
        # cluster analysis
72
        agglomerative_instance.process();
73
        
74
        # obtain results of clustering
75
        clusters = agglomerative_instance.get_clusters();  
76
    @endcode
77
    
78
    Algorithm performance can be improved if 'ccore' flag is on. In this case C++ library will be called for clustering.
79
    There is example of clustering 'LSUN' sample when usage of single or complete link will take a lot of resources and
80
    when core usage is prefereble.
81
    @code
82
        # sample Lsun for cluster analysis
83
        lsun_sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN);
84
        
85
        # create instance of the algorithm that will use ccore library (the last argument)
86
        agglomerative_instance = agglomerative(lsun_sample, 3, link_type.SINGLE_LINK, True);
87
        
88
        # start processing
89
        agglomerative_instance.process();
90
        
91
        # get result and visualize it
92
        lsun_clusters = agglomerative_instance.get_clusters();
93
        visualizer = cluster_visualizer();
94
        visualizer.append_clusters(lsun_clusters, lsun_sample);
95
        visualizer.show();
96
    @endcode
97
    
98
    Example of agglomerative clustering using different links:
99
    @image html agglomerative_lsun_clustering_single_link.png
100
    
101
    """
102
    
103
    def __init__(self, data, number_clusters, link = None, ccore = False):
104
        """!
105
        @brief Constructor of agglomerative hierarchical algorithm.
106
        
107
        @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by list, for example
108
                    [[0.1, 0.2], [0.4, 0.5], [1.3, 0.9]].
109
        @param[in] number_clusters (uint): Number of clusters that should be allocated.
110
        @param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default.
111
        @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not (by default it is 'False').
112
        
113
        """  
114
        
115
        self.__pointer_data = data;
116
        self.__number_clusters = number_clusters;
117
        self.__similarity = link;
118
        
119
        if (self.__similarity is None):
120
            self.__similarity = type_link.CENTROID_LINK;
121
        
122
        self.__clusters = [];
123
        self.__ccore = ccore;
124
        
125
        if (self.__similarity == type_link.CENTROID_LINK):
126
            self.__centers = self.__pointer_data.copy();    # used in case of usage of centroid links
127
    
128
    
129
    def process(self):
130
        """!
131
        @brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
132
        
133
        @see get_clusters()
134
        
135
        """
136
        
137
        if (self.__ccore is True):
138
            self.__clusters = wrapper.agglomerative_algorithm(self.__pointer_data, self.__number_clusters, self.__similarity);
139
140
        else:
141
            self.__clusters = [[index] for index in range(0, len(self.__pointer_data))];
142
            
143
            current_number_clusters = len(self.__clusters);
144
                
145
            while (current_number_clusters > self.__number_clusters):
146
                self.__merge_similar_clusters();
147
                current_number_clusters = len(self.__clusters);
148
    
149
    
150
    def get_clusters(self):
151
        """!
152
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
153
        
154
        @remark Results of clustering can be obtained using corresponding gets methods.
155
        
156
        @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
157
        
158
        @see process()
159
        
160
        """
161
        
162
        return self.__clusters;
163
    
164
    
165
    def get_cluster_encoding(self):
166
        """!
167
        @brief Returns clustering result representation type that indicate how clusters are encoded.
168
        
169
        @return (type_encoding) Clustering result representation.
170
        
171
        @see get_clusters()
172
        
173
        """
174
        
175
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
176
    
177
    
178
    def __merge_similar_clusters(self):
179
        """!
180
        @brief Merges the most similar clusters in line with link type.
181
        
182
        """
183
        
184
        if (self.__similarity == type_link.AVERAGE_LINK):
185
            self.__merge_by_average_link();
186
        
187
        elif (self.__similarity == type_link.CENTROID_LINK):
188
            self.__merge_by_centroid_link();
189
        
190
        elif (self.__similarity == type_link.COMPLETE_LINK):
191
            self.__merge_by_complete_link();
192
        
193
        elif (self.__similarity == type_link.SINGLE_LINK):
194
            self.__merge_by_signle_link();
195
        
196
        else:
197
            raise NameError('Not supported similarity is used');
198
    
199
    
200
    def __merge_by_average_link(self):
201
        """!
202
        @brief Merges the most similar clusters in line with average link type.
203
        
204
        """
205
        
206
        minimum_average_distance = float('Inf');
207
        
208 View Code Duplication
        for index_cluster1 in range(0, len(self.__clusters)):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
209
            for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
210
                
211
                # Find farthest objects
212
                candidate_average_distance = 0.0;
213
                for index_object1 in self.__clusters[index_cluster1]:
214
                    for index_object2 in self.__clusters[index_cluster2]:
215
                        candidate_average_distance += euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
216
                
217
                candidate_average_distance /= (len(self.__clusters[index_cluster1]) + len(self.__clusters[index_cluster2]));
218
                
219
                if (candidate_average_distance < minimum_average_distance):
220
                    minimum_average_distance = candidate_average_distance;
221
                    indexes = [index_cluster1, index_cluster2];
222
        
223
        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];  
224
        self.__clusters.pop(indexes[1]);   # remove merged cluster.  
225
        
226
    
227
    def __merge_by_centroid_link(self):
228
        """!
229
        @brief Merges the most similar clusters in line with centroid link type.
230
        
231
        """
232
        
233
        minimum_centroid_distance = float('Inf');
234
        indexes = None;
235
        
236
        for index1 in range(0, len(self.__centers)):
237
            for index2 in range(index1 + 1, len(self.__centers)):
238
                distance = euclidean_distance_sqrt(self.__centers[index1], self.__centers[index2]);
239
                if (distance < minimum_centroid_distance):
240
                    minimum_centroid_distance = distance;
241
                    indexes = [index1, index2];
242
        
243
        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
244
        self.__centers[indexes[0]] = self.__calculate_center(self.__clusters[indexes[0]]);
245
         
246
        self.__clusters.pop(indexes[1]);   # remove merged cluster.
247
        self.__centers.pop(indexes[1]);    # remove merged center.
248
    
249
    
250 View Code Duplication
    def __merge_by_complete_link(self):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
251
        """!
252
        @brief Merges the most similar clusters in line with complete link type.
253
        
254
        """
255
        
256
        minimum_complete_distance = float('Inf');
257
        indexes = None;
258
        
259
        for index_cluster1 in range(0, len(self.__clusters)):
260
            for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
261
                candidate_maximum_distance = self.__calculate_farthest_distance(index_cluster1, index_cluster2);
262
                
263
                if (candidate_maximum_distance < minimum_complete_distance):
264
                    minimum_complete_distance = candidate_maximum_distance;
265
                    indexes = [index_cluster1, index_cluster2];
266
267
        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];  
268
        self.__clusters.pop(indexes[1]);   # remove merged cluster.        
269
    
270
    
271
    def __calculate_farthest_distance(self, index_cluster1, index_cluster2):
272
        """!
273
        @brief Finds two farthest objects in two specified clusters in terms and returns distance between them.
274
        
275
        @param[in] (uint) Index of the first cluster.
276
        @param[in] (uint) Index of the second cluster.
277
        
278
        @return The farthest euclidean distance between two clusters.
279
        
280
        """
281
        candidate_maximum_distance = 0.0;
282
        for index_object1 in self.__clusters[index_cluster1]:
283
            for index_object2 in self.__clusters[index_cluster2]:
284
                distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
285
                
286
                if (distance > candidate_maximum_distance):
287
                    candidate_maximum_distance = distance;
288
    
289
        return candidate_maximum_distance;
290
    
291
    
292
    def __merge_by_signle_link(self):
293
        """!
294
        @brief Merges the most similar clusters in line with single link type.
295
        
296
        """
297
        
298
        minimum_single_distance = float('Inf');
299
        indexes = None;
300
        
301
        for index_cluster1 in range(0, len(self.__clusters)):
302
            for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
303
                candidate_minimum_distance = self.__calculate_nearest_distance(index_cluster1, index_cluster2);
304
                
305
                if (candidate_minimum_distance < minimum_single_distance):
306
                    minimum_single_distance = candidate_minimum_distance;
307
                    indexes = [index_cluster1, index_cluster2];
308
309
        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];  
310
        self.__clusters.pop(indexes[1]);   # remove merged cluster.
311
    
312
    
313
    def __calculate_nearest_distance(self, index_cluster1, index_cluster2):
314
        """!
315
        @brief Finds two nearest objects in two specified clusters and returns distance between them.
316
        
317
        @param[in] (uint) Index of the first cluster.
318
        @param[in] (uint) Index of the second cluster.
319
        
320
        @return The nearest euclidean distance between two clusters.
321
        
322
        """
323
        candidate_minimum_distance = float('Inf');
324
        
325
        for index_object1 in self.__clusters[index_cluster1]:
326
            for index_object2 in self.__clusters[index_cluster2]:
327
                distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
328
                if (distance < candidate_minimum_distance):
329
                    candidate_minimum_distance = distance;
330
        
331
        return candidate_minimum_distance;
332
    
333
    
334
    def __calculate_center(self, cluster):
335
        """!
336
        @brief Calculates new center.
337
        
338
        @return (list) New value of the center of the specified cluster.
339
        
340
        """
341
         
342
        dimension = len(self.__pointer_data[cluster[0]]);
343
        center = [0] * dimension;
344
        for index_point in cluster:
345
            for index_dimension in range(0, dimension):
346
                center[index_dimension] += self.__pointer_data[index_point][index_dimension];
347
         
348
        for index_dimension in range(0, dimension):
349
            center[index_dimension] /= len(cluster);
350
             
351
        return center;