Completed
Push — master ( 6ed467...f3f897 )
by Andrei
01:29
created

agglomerative.get_cluster_encoding()   A

Complexity

Conditions 1

Size

Total Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
dl 0
loc 11
rs 9.4285
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: agglomerative algorithm.
4
@details Implementation based on book:
5
         - K.Anil, J.C.Dubes, R.C.Dubes. Algorithms for Clustering Data. 1988.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2016
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
from enum import IntEnum;
30
31
from pyclustering.cluster.encoder import type_encoding;
32
33
from pyclustering.utils import euclidean_distance_sqrt;
34
35
import pyclustering.core.agglomerative_wrapper as wrapper;
36
37
38
class type_link(IntEnum):
39
    """!
40
    @brief Enumerator of types of link between clusters.
41
    
42
    """
43
44
    ## Nearest objects in clusters is considered as a link.
45
    SINGLE_LINK = 0;
46
    
47
    ## Farthest objects in clusters is considered as a link.
48
    COMPLETE_LINK = 1;
49
    
50
    ## Average distance between objects in clusters is considered as a link.
51
    AVERAGE_LINK = 2;
52
    
53
    ## Distance between centers of clusters is considered as a link.
54
    CENTROID_LINK = 3;
55
56
57
class agglomerative:
58
    """!
59
    @brief Class represents agglomerative algorithm for cluster analysis.
60
    
61
    Example:
62
    @code
63
        # sample for cluster analysis (represented by list)
64
        sample = read_sample(path_to_sample);
65
        
66
        # create object that uses python code only
67
        agglomerative_instance = agglomerative(sample, 2, link_type.CENTROID_LINK)
68
        
69
        # cluster analysis
70
        agglomerative_instance.process();
71
        
72
        # obtain results of clustering
73
        clusters = agglomerative_instance.get_clusters();  
74
    @endcode
75
    
76
    """
77
    
78
    def __init__(self, data, number_clusters, link = None, ccore = False):
79
        """!
80
        @brief Constructor of agglomerative hierarchical algorithm.
81
        
82
        @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by a list or tuple.
83
        @param[in] number_clusters (uint): Number of clusters that should be allocated.
84
        @param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default.
85
        @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
86
        
87
        """  
88
        
89
        self.__pointer_data = data;
90
        self.__number_clusters = number_clusters;
91
        self.__similarity = link;
92
        
93
        if (self.__similarity is None):
94
            self.__similarity = type_link.CENTROID_LINK;
95
        
96
        self.__clusters = [];
97
        self.__ccore = ccore;
98
        
99
        if (self.__similarity == type_link.CENTROID_LINK):
100
            self.__centers = self.__pointer_data.copy();    # used in case of usage of centroid links
101
    
102
    
103
    def process(self):
104
        """!
105
        @brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
106
        
107
        @see get_clusters()
108
        
109
        """
110
        
111
        if (self.__ccore is True):
112
            self.__clusters = wrapper.agglomerative_algorithm(self.__pointer_data, self.__number_clusters, self.__similarity);
113
114
        else:
115
            self.__clusters = [[index] for index in range(0, len(self.__pointer_data))];
116
            
117
            current_number_clusters = len(self.__clusters);
118
                
119
            while (current_number_clusters > self.__number_clusters):
120
                self.__merge_similar_clusters();
121
                current_number_clusters = len(self.__clusters);
122
    
123
    
124
    def get_clusters(self):
125
        """!
126
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
127
        
128
        @remark Results of clustering can be obtained using corresponding gets methods.
129
        
130
        @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
131
        
132
        @see process()
133
        
134
        """
135
        
136
        return self.__clusters;
137
    
138
    
139
    def get_cluster_encoding(self):
140
        """!
141
        @brief Returns clustering result representation type that indicate how clusters are encoded.
142
        
143
        @return (type_encoding) Clustering result representation.
144
        
145
        @see get_clusters()
146
        
147
        """
148
        
149
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
150
    
151
    
152
    def __merge_similar_clusters(self):
153
        """!
154
        @brief Merges the most similar clusters in line with link type.
155
        
156
        """
157
        
158
        if (self.__similarity == type_link.AVERAGE_LINK):
159
            self.__merge_by_average_link();
160
        
161
        elif (self.__similarity == type_link.CENTROID_LINK):
162
            self.__merge_by_centroid_link();
163
        
164
        elif (self.__similarity == type_link.COMPLETE_LINK):
165
            self.__merge_by_complete_link();
166
        
167
        elif (self.__similarity == type_link.SINGLE_LINK):
168
            self.__merge_by_signle_link();
169
        
170
        else:
171
            raise NameError('Not supported similarity is used');
172
    
173
    
174
    def __merge_by_average_link(self):
175
        """!
176
        @brief Merges the most similar clusters in line with average link type.
177
        
178
        """
179
        
180
        minimum_average_distance = float('Inf');
181
        
182
        for index_cluster1 in range(0, len(self.__clusters)):
183
            for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
184
                
185
                # Find farthest objects
186
                candidate_average_distance = 0.0;
187
                for index_object1 in self.__clusters[index_cluster1]:
188
                    for index_object2 in self.__clusters[index_cluster2]:
189
                        candidate_average_distance += euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
190
                
191
                candidate_average_distance /= (len(self.__clusters[index_cluster1]) + len(self.__clusters[index_cluster2]));
192
                
193
                if (candidate_average_distance < minimum_average_distance):
194
                    minimum_average_distance = candidate_average_distance;
195
                    indexes = [index_cluster1, index_cluster2];
196
        
197
        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];  
198
        self.__clusters.pop(indexes[1]);   # remove merged cluster.  
199
        
200
    
201
    def __merge_by_centroid_link(self):
202
        """!
203
        @brief Merges the most similar clusters in line with centroid link type.
204
        
205
        """
206
        
207
        minimum_centroid_distance = float('Inf');
208 View Code Duplication
        indexes = None;
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
209
        
210
        for index1 in range(0, len(self.__centers)):
211
            for index2 in range(index1 + 1, len(self.__centers)):
212
                distance = euclidean_distance_sqrt(self.__centers[index1], self.__centers[index2]);
213
                if (distance < minimum_centroid_distance):
214
                    minimum_centroid_distance = distance;
215
                    indexes = [index1, index2];
216
        
217
        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
218
        self.__centers[indexes[0]] = self.__calculate_center(self.__clusters[indexes[0]]);
219
         
220
        self.__clusters.pop(indexes[1]);   # remove merged cluster.
221
        self.__centers.pop(indexes[1]);    # remove merged center.
222
    
223
    
224
    def __merge_by_complete_link(self):
225
        """!
226
        @brief Merges the most similar clusters in line with complete link type.
227
        
228
        """
229
        
230
        minimum_complete_distance = float('Inf');
231
        indexes = None;
232
        
233
        for index_cluster1 in range(0, len(self.__clusters)):
234
            for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
235
                candidate_maximum_distance = self.__calculate_farthest_distance(index_cluster1, index_cluster2);
236
                
237
                if (candidate_maximum_distance < minimum_complete_distance):
238
                    minimum_complete_distance = candidate_maximum_distance;
239
                    indexes = [index_cluster1, index_cluster2];
240
241
        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];  
242
        self.__clusters.pop(indexes[1]);   # remove merged cluster.        
243
    
244
    
245
    def __calculate_farthest_distance(self, index_cluster1, index_cluster2):
246
        """!
247
        @brief Finds two farthest objects in two specified clusters in terms and returns distance between them.
248
        
249
        @param[in] (uint) Index of the first cluster.
250 View Code Duplication
        @param[in] (uint) Index of the second cluster.
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
251
        
252
        @return The farthest euclidean distance between two clusters.
253
        
254
        """
255
        candidate_maximum_distance = 0.0;
256
        for index_object1 in self.__clusters[index_cluster1]:
257
            for index_object2 in self.__clusters[index_cluster2]:
258
                distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
259
                
260
                if (distance > candidate_maximum_distance):
261
                    candidate_maximum_distance = distance;
262
    
263
        return candidate_maximum_distance;
264
    
265
    
266
    def __merge_by_signle_link(self):
267
        """!
268
        @brief Merges the most similar clusters in line with single link type.
269
        
270
        """
271
        
272
        minimum_single_distance = float('Inf');
273
        indexes = None;
274
        
275
        for index_cluster1 in range(0, len(self.__clusters)):
276
            for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
277
                candidate_minimum_distance = self.__calculate_nearest_distance(index_cluster1, index_cluster2);
278
                
279
                if (candidate_minimum_distance < minimum_single_distance):
280
                    minimum_single_distance = candidate_minimum_distance;
281
                    indexes = [index_cluster1, index_cluster2];
282
283
        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];  
284
        self.__clusters.pop(indexes[1]);   # remove merged cluster.
285
    
286
    
287
    def __calculate_nearest_distance(self, index_cluster1, index_cluster2):
288
        """!
289
        @brief Finds two nearest objects in two specified clusters and returns distance between them.
290
        
291
        @param[in] (uint) Index of the first cluster.
292
        @param[in] (uint) Index of the second cluster.
293
        
294
        @return The nearest euclidean distance between two clusters.
295
        
296
        """
297
        candidate_minimum_distance = float('Inf');
298
        
299
        for index_object1 in self.__clusters[index_cluster1]:
300
            for index_object2 in self.__clusters[index_cluster2]:
301
                distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
302
                if (distance < candidate_minimum_distance):
303
                    candidate_minimum_distance = distance;
304
        
305
        return candidate_minimum_distance;
306
    
307
    
308
    def __calculate_center(self, cluster):
309
        """!
310
        @brief Calculates new center.
311
        
312
        @return (list) New value of the center of the specified cluster.
313
        
314
        """
315
         
316
        dimension = len(self.__pointer_data[cluster[0]]);
317
        center = [0] * dimension;
318
        for index_point in cluster:
319
            for index_dimension in range(0, dimension):
320
                center[index_dimension] += self.__pointer_data[index_point][index_dimension];
321
         
322
        for index_dimension in range(0, dimension):
323
            center[index_dimension] /= len(cluster);
324
             
325
        return center;