Completed
Push — master ( 1d669d...037fe6 )
by Andrei
01:33
created

agglomerative.__calculate_center()   A

Complexity

Conditions 4

Size

Total Lines 18

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 4
dl 0
loc 18
rs 9.2
1
"""!
2
3
@brief Cluster analysis algorithm: agglomerative algorithm.
4
@details Implementation based on book:
5
         - K.Anil, J.C.Dubes, R.C.Dubes. Algorithms for Clustering Data. 1988.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2016
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
from enum import IntEnum;
29
30
from pyclustering.utils import euclidean_distance_sqrt;
31
32
import pyclustering.core.agglomerative_wrapper as wrapper;
33
34
35
class type_link(IntEnum):
36
    """!
37
    @brief Enumerator of types of link between clusters.
38
    
39
    """
40
41
    ## Nearest objects in clusters is considered as a link.
42
    SINGLE_LINK = 0;
43
    
44
    ## Farthest objects in clusters is considered as a link.
45
    COMPLETE_LINK = 1;
46
    
47
    ## Average distance between objects in clusters is considered as a link.
48
    AVERAGE_LINK = 2;
49
    
50
    ## Distance between centers of clusters is considered as a link.
51
    CENTROID_LINK = 3;
52
53
54
class agglomerative:
55
    """!
56
    @brief Class represents agglomerative algorithm for cluster analysis.
57
    
58
    Example:
59
    @code
60
        # sample for cluster analysis (represented by list)
61
        sample = read_sample(path_to_sample);
62
        
63
        # create object that uses python code only
64
        agglomerative_instance = agglomerative(sample, 2, link_type.CENTROID_LINK)
65
        
66
        # cluster analysis
67
        agglomerative_instance.process();
68
        
69
        # obtain results of clustering
70
        clusters = agglomerative_instance.get_clusters();  
71
    @endcode
72
    
73
    """
74
    
75
    def __init__(self, data, number_clusters, link = None, ccore = False):
76
        """!
77
        @brief Constructor of agglomerative hierarchical algorithm.
78
        
79
        @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by a list or tuple.
80
        @param[in] number_clusters (uint): Number of clusters that should be allocated.
81
        @param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default.
82
        @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
83
        
84
        """  
85
        
86
        self.__pointer_data = data;
87
        self.__number_clusters = number_clusters;
88
        self.__similarity = link;
89
        
90
        if (self.__similarity is None):
91
            self.__similarity = type_link.CENTROID_LINK;
92
        
93
        self.__clusters = [];
94
        self.__ccore = ccore;
95
        
96
        if (self.__similarity == type_link.CENTROID_LINK):
97
            self.__centers = self.__pointer_data.copy();    # used in case of usage of centroid links
98
    
99
    
100
    def process(self):
101
        """!
102
        @brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
103
        
104
        @see get_clusters()
105
        
106
        """
107
        
108
        if (self.__ccore is True):
109
            self.__clusters = wrapper.agglomerative_algorithm(self.__pointer_data, self.__number_clusters, self.__similarity);
110
111
        else:
112
            self.__clusters = [[index] for index in range(0, len(self.__pointer_data))];
113
            
114
            current_number_clusters = len(self.__clusters);
115
                
116
            while (current_number_clusters > self.__number_clusters):
117
                self.__merge_similar_clusters();
118
                current_number_clusters = len(self.__clusters);
119
    
120
    
121
    def get_clusters(self):
122
        """!
123
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
124
        
125
        @remark Results of clustering can be obtained using corresponding gets methods.
126
        
127
        @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
128
        
129
        @see process()
130
        
131
        """
132
        
133
        return self.__clusters;
134
    
135
    
136
    def __merge_similar_clusters(self):
137
        """!
138
        @brief Merges the most similar clusters in line with link type.
139
        
140
        """
141
        
142
        if (self.__similarity == type_link.AVERAGE_LINK):
143
            self.__merge_by_average_link();
144
        
145
        elif (self.__similarity == type_link.CENTROID_LINK):
146
            self.__merge_by_centroid_link();
147
        
148
        elif (self.__similarity == type_link.COMPLETE_LINK):
149
            self.__merge_by_complete_link();
150
        
151
        elif (self.__similarity == type_link.SINGLE_LINK):
152
            self.__merge_by_signle_link();
153
        
154
        else:
155
            raise NameError('Not supported similarity is used');
156
    
157
    
158
    def __merge_by_average_link(self):
159
        """!
160
        @brief Merges the most similar clusters in line with average link type.
161
        
162
        """
163
        
164
        minimum_average_distance = float('Inf');
165
        
166
        for index_cluster1 in range(0, len(self.__clusters)):
167
            for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
168
                
169
                # Find farthest objects
170
                candidate_average_distance = 0.0;
171
                for index_object1 in self.__clusters[index_cluster1]:
172
                    for index_object2 in self.__clusters[index_cluster2]:
173
                        candidate_average_distance += euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
174
                
175
                candidate_average_distance /= (len(self.__clusters[index_cluster1]) + len(self.__clusters[index_cluster2]));
176
                
177
                if (candidate_average_distance < minimum_average_distance):
178
                    minimum_average_distance = candidate_average_distance;
179
                    indexes = [index_cluster1, index_cluster2];
180
        
181
        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];  
182
        self.__clusters.pop(indexes[1]);   # remove merged cluster.  
183
        
184
    
185
    def __merge_by_centroid_link(self):
186
        """!
187
        @brief Merges the most similar clusters in line with centroid link type.
188
        
189
        """
190
        
191
        minimum_centroid_distance = float('Inf');
192
        indexes = None;
193
        
194
        for index1 in range(0, len(self.__centers)):
195
            for index2 in range(index1 + 1, len(self.__centers)):
196
                distance = euclidean_distance_sqrt(self.__centers[index1], self.__centers[index2]);
197
                if (distance < minimum_centroid_distance):
198
                    minimum_centroid_distance = distance;
199
                    indexes = [index1, index2];
200
        
201
        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
202
        self.__centers[indexes[0]] = self.__calculate_center(self.__clusters[indexes[0]]);
203
         
204
        self.__clusters.pop(indexes[1]);   # remove merged cluster.
205
        self.__centers.pop(indexes[1]);    # remove merged center.
206
    
207
    
208 View Code Duplication
    def __merge_by_complete_link(self):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
209
        """!
210
        @brief Merges the most similar clusters in line with complete link type.
211
        
212
        """
213
        
214
        minimum_complete_distance = float('Inf');
215
        indexes = None;
216
        
217
        for index_cluster1 in range(0, len(self.__clusters)):
218
            for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
219
                candidate_maximum_distance = self.__calculate_farthest_distance(index_cluster1, index_cluster2);
220
                
221
                if (candidate_maximum_distance < minimum_complete_distance):
222
                    minimum_complete_distance = candidate_maximum_distance;
223
                    indexes = [index_cluster1, index_cluster2];
224
225
        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];  
226
        self.__clusters.pop(indexes[1]);   # remove merged cluster.        
227
    
228
    
229
    def __calculate_farthest_distance(self, index_cluster1, index_cluster2):
230
        """!
231
        @brief Finds two farthest objects in two specified clusters in terms and returns distance between them.
232
        
233
        @param[in] (uint) Index of the first cluster.
234
        @param[in] (uint) Index of the second cluster.
235
        
236
        @return The farthest euclidean distance between two clusters.
237
        
238
        """
239
        candidate_maximum_distance = 0.0;
240
        for index_object1 in self.__clusters[index_cluster1]:
241
            for index_object2 in self.__clusters[index_cluster2]:
242
                distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
243
                
244
                if (distance > candidate_maximum_distance):
245
                    candidate_maximum_distance = distance;
246
    
247
        return candidate_maximum_distance;
248
    
249
    
250 View Code Duplication
    def __merge_by_signle_link(self):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
251
        """!
252
        @brief Merges the most similar clusters in line with single link type.
253
        
254
        """
255
        
256
        minimum_single_distance = float('Inf');
257
        indexes = None;
258
        
259
        for index_cluster1 in range(0, len(self.__clusters)):
260
            for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
261
                candidate_minimum_distance = self.__calculate_nearest_distance(index_cluster1, index_cluster2);
262
                
263
                if (candidate_minimum_distance < minimum_single_distance):
264
                    minimum_single_distance = candidate_minimum_distance;
265
                    indexes = [index_cluster1, index_cluster2];
266
267
        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];  
268
        self.__clusters.pop(indexes[1]);   # remove merged cluster.
269
    
270
    
271
    def __calculate_nearest_distance(self, index_cluster1, index_cluster2):
272
        """!
273
        @brief Finds two nearest objects in two specified clusters and returns distance between them.
274
        
275
        @param[in] (uint) Index of the first cluster.
276
        @param[in] (uint) Index of the second cluster.
277
        
278
        @return The nearest euclidean distance between two clusters.
279
        
280
        """
281
        candidate_minimum_distance = float('Inf');
282
        
283
        for index_object1 in self.__clusters[index_cluster1]:
284
            for index_object2 in self.__clusters[index_cluster2]:
285
                distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
286
                if (distance < candidate_minimum_distance):
287
                    candidate_minimum_distance = distance;
288
        
289
        return candidate_minimum_distance;
290
    
291
    
292
    def __calculate_center(self, cluster):
293
        """!
294
        @brief Calculates new center.
295
        
296
        @return (list) New value of the center of the specified cluster.
297
        
298
        """
299
         
300
        dimension = len(self.__pointer_data[cluster[0]]);
301
        center = [0] * dimension;
302
        for index_point in cluster:
303
            for index_dimension in range(0, dimension):
304
                center[index_dimension] += self.__pointer_data[index_point][index_dimension];
305
         
306
        for index_dimension in range(0, dimension):
307
            center[index_dimension] /= len(cluster);
308
             
309
        return center;