Total Complexity | 44 |
Total Lines | 295 |
Duplicated Lines | 12.88 % |
Changes | 0 |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems, and corresponding solutions are:
Complex classes like agglomerative often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | """! |
||
57 | class agglomerative: |
||
58 | """! |
||
59 | @brief Class represents agglomerative algorithm for cluster analysis. |
||
60 | @details Agglomerative algorithm considers each data point (object) as a separate cluster at the beggining and |
||
61 | step by step finds the best pair of clusters for merge until required amount of clusters is obtained. |
||
62 | |||
63 | Example of agglomerative algorithm where centroid link is used: |
||
64 | @code |
||
65 | # sample for cluster analysis (represented by list) |
||
66 | sample = read_sample(path_to_sample); |
||
67 | |||
68 | # create object that uses python code only |
||
69 | agglomerative_instance = agglomerative(sample, 2, link_type.CENTROID_LINK) |
||
70 | |||
71 | # cluster analysis |
||
72 | agglomerative_instance.process(); |
||
73 | |||
74 | # obtain results of clustering |
||
75 | clusters = agglomerative_instance.get_clusters(); |
||
76 | @endcode |
||
77 | |||
78 | Algorithm performance can be improved if 'ccore' flag is on. In this case C++ library will be called for clustering. |
||
79 | There is example of clustering 'LSUN' sample when usage of single or complete link will take a lot of resources and |
||
80 | when core usage is prefereble. |
||
81 | @code |
||
82 | # sample Lsun for cluster analysis |
||
83 | lsun_sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN); |
||
84 | |||
85 | # create instance of the algorithm that will use ccore library (the last argument) |
||
86 | agglomerative_instance = agglomerative(lsun_sample, 3, link_type.SINGLE_LINK, True); |
||
87 | |||
88 | # start processing |
||
89 | agglomerative_instance.process(); |
||
90 | |||
91 | # get result and visualize it |
||
92 | lsun_clusters = agglomerative_instance.get_clusters(); |
||
93 | visualizer = cluster_visualizer(); |
||
94 | visualizer.append_clusters(lsun_clusters, lsun_sample); |
||
95 | visualizer.show(); |
||
96 | @endcode |
||
97 | |||
98 | Example of agglomerative clustering using different links: |
||
99 | @image html agglomerative_lsun_clustering_single_link.png |
||
100 | |||
101 | """ |
||
102 | |||
103 | def __init__(self, data, number_clusters, link = None, ccore = False): |
||
104 | """! |
||
105 | @brief Constructor of agglomerative hierarchical algorithm. |
||
106 | |||
107 | @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by list, for example |
||
108 | [[0.1, 0.2], [0.4, 0.5], [1.3, 0.9]]. |
||
109 | @param[in] number_clusters (uint): Number of clusters that should be allocated. |
||
110 | @param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default. |
||
111 | @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not (by default it is 'False'). |
||
112 | |||
113 | """ |
||
114 | |||
115 | self.__pointer_data = data; |
||
116 | self.__number_clusters = number_clusters; |
||
117 | self.__similarity = link; |
||
118 | |||
119 | if (self.__similarity is None): |
||
120 | self.__similarity = type_link.CENTROID_LINK; |
||
121 | |||
122 | self.__clusters = []; |
||
123 | self.__ccore = ccore; |
||
124 | |||
125 | if (self.__similarity == type_link.CENTROID_LINK): |
||
126 | self.__centers = self.__pointer_data.copy(); # used in case of usage of centroid links |
||
127 | |||
128 | |||
129 | def process(self): |
||
130 | """! |
||
131 | @brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity. |
||
132 | |||
133 | @see get_clusters() |
||
134 | |||
135 | """ |
||
136 | |||
137 | if (self.__ccore is True): |
||
138 | self.__clusters = wrapper.agglomerative_algorithm(self.__pointer_data, self.__number_clusters, self.__similarity); |
||
139 | |||
140 | else: |
||
141 | self.__clusters = [[index] for index in range(0, len(self.__pointer_data))]; |
||
142 | |||
143 | current_number_clusters = len(self.__clusters); |
||
144 | |||
145 | while (current_number_clusters > self.__number_clusters): |
||
146 | self.__merge_similar_clusters(); |
||
147 | current_number_clusters = len(self.__clusters); |
||
148 | |||
149 | |||
150 | def get_clusters(self): |
||
151 | """! |
||
152 | @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data. |
||
153 | |||
154 | @remark Results of clustering can be obtained using corresponding gets methods. |
||
155 | |||
156 | @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data. |
||
157 | |||
158 | @see process() |
||
159 | |||
160 | """ |
||
161 | |||
162 | return self.__clusters; |
||
163 | |||
164 | |||
165 | def get_cluster_encoding(self): |
||
166 | """! |
||
167 | @brief Returns clustering result representation type that indicate how clusters are encoded. |
||
168 | |||
169 | @return (type_encoding) Clustering result representation. |
||
170 | |||
171 | @see get_clusters() |
||
172 | |||
173 | """ |
||
174 | |||
175 | return type_encoding.CLUSTER_INDEX_LIST_SEPARATION; |
||
176 | |||
177 | |||
178 | def __merge_similar_clusters(self): |
||
179 | """! |
||
180 | @brief Merges the most similar clusters in line with link type. |
||
181 | |||
182 | """ |
||
183 | |||
184 | if (self.__similarity == type_link.AVERAGE_LINK): |
||
185 | self.__merge_by_average_link(); |
||
186 | |||
187 | elif (self.__similarity == type_link.CENTROID_LINK): |
||
188 | self.__merge_by_centroid_link(); |
||
189 | |||
190 | elif (self.__similarity == type_link.COMPLETE_LINK): |
||
191 | self.__merge_by_complete_link(); |
||
192 | |||
193 | elif (self.__similarity == type_link.SINGLE_LINK): |
||
194 | self.__merge_by_signle_link(); |
||
195 | |||
196 | else: |
||
197 | raise NameError('Not supported similarity is used'); |
||
198 | |||
199 | |||
200 | def __merge_by_average_link(self): |
||
201 | """! |
||
202 | @brief Merges the most similar clusters in line with average link type. |
||
203 | |||
204 | """ |
||
205 | |||
206 | minimum_average_distance = float('Inf'); |
||
207 | |||
208 | View Code Duplication | for index_cluster1 in range(0, len(self.__clusters)): |
|
|
|||
209 | for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)): |
||
210 | |||
211 | # Find farthest objects |
||
212 | candidate_average_distance = 0.0; |
||
213 | for index_object1 in self.__clusters[index_cluster1]: |
||
214 | for index_object2 in self.__clusters[index_cluster2]: |
||
215 | candidate_average_distance += euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]); |
||
216 | |||
217 | candidate_average_distance /= (len(self.__clusters[index_cluster1]) + len(self.__clusters[index_cluster2])); |
||
218 | |||
219 | if (candidate_average_distance < minimum_average_distance): |
||
220 | minimum_average_distance = candidate_average_distance; |
||
221 | indexes = [index_cluster1, index_cluster2]; |
||
222 | |||
223 | self.__clusters[indexes[0]] += self.__clusters[indexes[1]]; |
||
224 | self.__clusters.pop(indexes[1]); # remove merged cluster. |
||
225 | |||
226 | |||
227 | def __merge_by_centroid_link(self): |
||
228 | """! |
||
229 | @brief Merges the most similar clusters in line with centroid link type. |
||
230 | |||
231 | """ |
||
232 | |||
233 | minimum_centroid_distance = float('Inf'); |
||
234 | indexes = None; |
||
235 | |||
236 | for index1 in range(0, len(self.__centers)): |
||
237 | for index2 in range(index1 + 1, len(self.__centers)): |
||
238 | distance = euclidean_distance_sqrt(self.__centers[index1], self.__centers[index2]); |
||
239 | if (distance < minimum_centroid_distance): |
||
240 | minimum_centroid_distance = distance; |
||
241 | indexes = [index1, index2]; |
||
242 | |||
243 | self.__clusters[indexes[0]] += self.__clusters[indexes[1]]; |
||
244 | self.__centers[indexes[0]] = self.__calculate_center(self.__clusters[indexes[0]]); |
||
245 | |||
246 | self.__clusters.pop(indexes[1]); # remove merged cluster. |
||
247 | self.__centers.pop(indexes[1]); # remove merged center. |
||
248 | |||
249 | |||
250 | View Code Duplication | def __merge_by_complete_link(self): |
|
251 | """! |
||
252 | @brief Merges the most similar clusters in line with complete link type. |
||
253 | |||
254 | """ |
||
255 | |||
256 | minimum_complete_distance = float('Inf'); |
||
257 | indexes = None; |
||
258 | |||
259 | for index_cluster1 in range(0, len(self.__clusters)): |
||
260 | for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)): |
||
261 | candidate_maximum_distance = self.__calculate_farthest_distance(index_cluster1, index_cluster2); |
||
262 | |||
263 | if (candidate_maximum_distance < minimum_complete_distance): |
||
264 | minimum_complete_distance = candidate_maximum_distance; |
||
265 | indexes = [index_cluster1, index_cluster2]; |
||
266 | |||
267 | self.__clusters[indexes[0]] += self.__clusters[indexes[1]]; |
||
268 | self.__clusters.pop(indexes[1]); # remove merged cluster. |
||
269 | |||
270 | |||
271 | def __calculate_farthest_distance(self, index_cluster1, index_cluster2): |
||
272 | """! |
||
273 | @brief Finds two farthest objects in two specified clusters in terms and returns distance between them. |
||
274 | |||
275 | @param[in] (uint) Index of the first cluster. |
||
276 | @param[in] (uint) Index of the second cluster. |
||
277 | |||
278 | @return The farthest euclidean distance between two clusters. |
||
279 | |||
280 | """ |
||
281 | candidate_maximum_distance = 0.0; |
||
282 | for index_object1 in self.__clusters[index_cluster1]: |
||
283 | for index_object2 in self.__clusters[index_cluster2]: |
||
284 | distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]); |
||
285 | |||
286 | if (distance > candidate_maximum_distance): |
||
287 | candidate_maximum_distance = distance; |
||
288 | |||
289 | return candidate_maximum_distance; |
||
290 | |||
291 | |||
292 | def __merge_by_signle_link(self): |
||
293 | """! |
||
294 | @brief Merges the most similar clusters in line with single link type. |
||
295 | |||
296 | """ |
||
297 | |||
298 | minimum_single_distance = float('Inf'); |
||
299 | indexes = None; |
||
300 | |||
301 | for index_cluster1 in range(0, len(self.__clusters)): |
||
302 | for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)): |
||
303 | candidate_minimum_distance = self.__calculate_nearest_distance(index_cluster1, index_cluster2); |
||
304 | |||
305 | if (candidate_minimum_distance < minimum_single_distance): |
||
306 | minimum_single_distance = candidate_minimum_distance; |
||
307 | indexes = [index_cluster1, index_cluster2]; |
||
308 | |||
309 | self.__clusters[indexes[0]] += self.__clusters[indexes[1]]; |
||
310 | self.__clusters.pop(indexes[1]); # remove merged cluster. |
||
311 | |||
312 | |||
313 | def __calculate_nearest_distance(self, index_cluster1, index_cluster2): |
||
314 | """! |
||
315 | @brief Finds two nearest objects in two specified clusters and returns distance between them. |
||
316 | |||
317 | @param[in] (uint) Index of the first cluster. |
||
318 | @param[in] (uint) Index of the second cluster. |
||
319 | |||
320 | @return The nearest euclidean distance between two clusters. |
||
321 | |||
322 | """ |
||
323 | candidate_minimum_distance = float('Inf'); |
||
324 | |||
325 | for index_object1 in self.__clusters[index_cluster1]: |
||
326 | for index_object2 in self.__clusters[index_cluster2]: |
||
327 | distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]); |
||
328 | if (distance < candidate_minimum_distance): |
||
329 | candidate_minimum_distance = distance; |
||
330 | |||
331 | return candidate_minimum_distance; |
||
332 | |||
333 | |||
334 | def __calculate_center(self, cluster): |
||
335 | """! |
||
336 | @brief Calculates new center. |
||
337 | |||
338 | @return (list) New value of the center of the specified cluster. |
||
339 | |||
340 | """ |
||
341 | |||
342 | dimension = len(self.__pointer_data[cluster[0]]); |
||
343 | center = [0] * dimension; |
||
344 | for index_point in cluster: |
||
345 | for index_dimension in range(0, dimension): |
||
346 | center[index_dimension] += self.__pointer_data[index_point][index_dimension]; |
||
347 | |||
348 | for index_dimension in range(0, dimension): |
||
349 | center[index_dimension] /= len(cluster); |
||
350 | |||
351 | return center; |