| Total Complexity | 44 |
| Total Lines | 295 |
| Duplicated Lines | 12.88 % |
| Changes | 0 | ||
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems, and corresponding solutions are:
Complex classes like agglomerative often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | """! |
||
| 57 | class agglomerative: |
||
| 58 | """! |
||
| 59 | @brief Class represents agglomerative algorithm for cluster analysis. |
||
| 60 | @details Agglomerative algorithm considers each data point (object) as a separate cluster at the beggining and |
||
| 61 | step by step finds the best pair of clusters for merge until required amount of clusters is obtained. |
||
| 62 | |||
| 63 | Example of agglomerative algorithm where centroid link is used: |
||
| 64 | @code |
||
| 65 | # sample for cluster analysis (represented by list) |
||
| 66 | sample = read_sample(path_to_sample); |
||
| 67 | |||
| 68 | # create object that uses python code only |
||
| 69 | agglomerative_instance = agglomerative(sample, 2, link_type.CENTROID_LINK) |
||
| 70 | |||
| 71 | # cluster analysis |
||
| 72 | agglomerative_instance.process(); |
||
| 73 | |||
| 74 | # obtain results of clustering |
||
| 75 | clusters = agglomerative_instance.get_clusters(); |
||
| 76 | @endcode |
||
| 77 | |||
| 78 | Algorithm performance can be improved if 'ccore' flag is on. In this case C++ library will be called for clustering. |
||
| 79 | There is example of clustering 'LSUN' sample when usage of single or complete link will take a lot of resources and |
||
| 80 | when core usage is prefereble. |
||
| 81 | @code |
||
| 82 | # sample Lsun for cluster analysis |
||
| 83 | lsun_sample = read_sample(FCPS_SAMPLES.SAMPLE_LSUN); |
||
| 84 | |||
| 85 | # create instance of the algorithm that will use ccore library (the last argument) |
||
| 86 | agglomerative_instance = agglomerative(lsun_sample, 3, link_type.SINGLE_LINK, True); |
||
| 87 | |||
| 88 | # start processing |
||
| 89 | agglomerative_instance.process(); |
||
| 90 | |||
| 91 | # get result and visualize it |
||
| 92 | lsun_clusters = agglomerative_instance.get_clusters(); |
||
| 93 | visualizer = cluster_visualizer(); |
||
| 94 | visualizer.append_clusters(lsun_clusters, lsun_sample); |
||
| 95 | visualizer.show(); |
||
| 96 | @endcode |
||
| 97 | |||
| 98 | Example of agglomerative clustering using different links: |
||
| 99 | @image html agglomerative_lsun_clustering_single_link.png |
||
| 100 | |||
| 101 | """ |
||
| 102 | |||
| 103 | def __init__(self, data, number_clusters, link = None, ccore = False): |
||
| 104 | """! |
||
| 105 | @brief Constructor of agglomerative hierarchical algorithm. |
||
| 106 | |||
| 107 | @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by list, for example |
||
| 108 | [[0.1, 0.2], [0.4, 0.5], [1.3, 0.9]]. |
||
| 109 | @param[in] number_clusters (uint): Number of clusters that should be allocated. |
||
| 110 | @param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default. |
||
| 111 | @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not (by default it is 'False'). |
||
| 112 | |||
| 113 | """ |
||
| 114 | |||
| 115 | self.__pointer_data = data; |
||
| 116 | self.__number_clusters = number_clusters; |
||
| 117 | self.__similarity = link; |
||
| 118 | |||
| 119 | if (self.__similarity is None): |
||
| 120 | self.__similarity = type_link.CENTROID_LINK; |
||
| 121 | |||
| 122 | self.__clusters = []; |
||
| 123 | self.__ccore = ccore; |
||
| 124 | |||
| 125 | if (self.__similarity == type_link.CENTROID_LINK): |
||
| 126 | self.__centers = self.__pointer_data.copy(); # used in case of usage of centroid links |
||
| 127 | |||
| 128 | |||
| 129 | def process(self): |
||
| 130 | """! |
||
| 131 | @brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity. |
||
| 132 | |||
| 133 | @see get_clusters() |
||
| 134 | |||
| 135 | """ |
||
| 136 | |||
| 137 | if (self.__ccore is True): |
||
| 138 | self.__clusters = wrapper.agglomerative_algorithm(self.__pointer_data, self.__number_clusters, self.__similarity); |
||
| 139 | |||
| 140 | else: |
||
| 141 | self.__clusters = [[index] for index in range(0, len(self.__pointer_data))]; |
||
| 142 | |||
| 143 | current_number_clusters = len(self.__clusters); |
||
| 144 | |||
| 145 | while (current_number_clusters > self.__number_clusters): |
||
| 146 | self.__merge_similar_clusters(); |
||
| 147 | current_number_clusters = len(self.__clusters); |
||
| 148 | |||
| 149 | |||
| 150 | def get_clusters(self): |
||
| 151 | """! |
||
| 152 | @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data. |
||
| 153 | |||
| 154 | @remark Results of clustering can be obtained using corresponding gets methods. |
||
| 155 | |||
| 156 | @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data. |
||
| 157 | |||
| 158 | @see process() |
||
| 159 | |||
| 160 | """ |
||
| 161 | |||
| 162 | return self.__clusters; |
||
| 163 | |||
| 164 | |||
| 165 | def get_cluster_encoding(self): |
||
| 166 | """! |
||
| 167 | @brief Returns clustering result representation type that indicate how clusters are encoded. |
||
| 168 | |||
| 169 | @return (type_encoding) Clustering result representation. |
||
| 170 | |||
| 171 | @see get_clusters() |
||
| 172 | |||
| 173 | """ |
||
| 174 | |||
| 175 | return type_encoding.CLUSTER_INDEX_LIST_SEPARATION; |
||
| 176 | |||
| 177 | |||
| 178 | def __merge_similar_clusters(self): |
||
| 179 | """! |
||
| 180 | @brief Merges the most similar clusters in line with link type. |
||
| 181 | |||
| 182 | """ |
||
| 183 | |||
| 184 | if (self.__similarity == type_link.AVERAGE_LINK): |
||
| 185 | self.__merge_by_average_link(); |
||
| 186 | |||
| 187 | elif (self.__similarity == type_link.CENTROID_LINK): |
||
| 188 | self.__merge_by_centroid_link(); |
||
| 189 | |||
| 190 | elif (self.__similarity == type_link.COMPLETE_LINK): |
||
| 191 | self.__merge_by_complete_link(); |
||
| 192 | |||
| 193 | elif (self.__similarity == type_link.SINGLE_LINK): |
||
| 194 | self.__merge_by_signle_link(); |
||
| 195 | |||
| 196 | else: |
||
| 197 | raise NameError('Not supported similarity is used'); |
||
| 198 | |||
| 199 | |||
| 200 | def __merge_by_average_link(self): |
||
| 201 | """! |
||
| 202 | @brief Merges the most similar clusters in line with average link type. |
||
| 203 | |||
| 204 | """ |
||
| 205 | |||
| 206 | minimum_average_distance = float('Inf'); |
||
| 207 | |||
| 208 | View Code Duplication | for index_cluster1 in range(0, len(self.__clusters)): |
|
|
|
|||
| 209 | for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)): |
||
| 210 | |||
| 211 | # Find farthest objects |
||
| 212 | candidate_average_distance = 0.0; |
||
| 213 | for index_object1 in self.__clusters[index_cluster1]: |
||
| 214 | for index_object2 in self.__clusters[index_cluster2]: |
||
| 215 | candidate_average_distance += euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]); |
||
| 216 | |||
| 217 | candidate_average_distance /= (len(self.__clusters[index_cluster1]) + len(self.__clusters[index_cluster2])); |
||
| 218 | |||
| 219 | if (candidate_average_distance < minimum_average_distance): |
||
| 220 | minimum_average_distance = candidate_average_distance; |
||
| 221 | indexes = [index_cluster1, index_cluster2]; |
||
| 222 | |||
| 223 | self.__clusters[indexes[0]] += self.__clusters[indexes[1]]; |
||
| 224 | self.__clusters.pop(indexes[1]); # remove merged cluster. |
||
| 225 | |||
| 226 | |||
| 227 | def __merge_by_centroid_link(self): |
||
| 228 | """! |
||
| 229 | @brief Merges the most similar clusters in line with centroid link type. |
||
| 230 | |||
| 231 | """ |
||
| 232 | |||
| 233 | minimum_centroid_distance = float('Inf'); |
||
| 234 | indexes = None; |
||
| 235 | |||
| 236 | for index1 in range(0, len(self.__centers)): |
||
| 237 | for index2 in range(index1 + 1, len(self.__centers)): |
||
| 238 | distance = euclidean_distance_sqrt(self.__centers[index1], self.__centers[index2]); |
||
| 239 | if (distance < minimum_centroid_distance): |
||
| 240 | minimum_centroid_distance = distance; |
||
| 241 | indexes = [index1, index2]; |
||
| 242 | |||
| 243 | self.__clusters[indexes[0]] += self.__clusters[indexes[1]]; |
||
| 244 | self.__centers[indexes[0]] = self.__calculate_center(self.__clusters[indexes[0]]); |
||
| 245 | |||
| 246 | self.__clusters.pop(indexes[1]); # remove merged cluster. |
||
| 247 | self.__centers.pop(indexes[1]); # remove merged center. |
||
| 248 | |||
| 249 | |||
| 250 | View Code Duplication | def __merge_by_complete_link(self): |
|
| 251 | """! |
||
| 252 | @brief Merges the most similar clusters in line with complete link type. |
||
| 253 | |||
| 254 | """ |
||
| 255 | |||
| 256 | minimum_complete_distance = float('Inf'); |
||
| 257 | indexes = None; |
||
| 258 | |||
| 259 | for index_cluster1 in range(0, len(self.__clusters)): |
||
| 260 | for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)): |
||
| 261 | candidate_maximum_distance = self.__calculate_farthest_distance(index_cluster1, index_cluster2); |
||
| 262 | |||
| 263 | if (candidate_maximum_distance < minimum_complete_distance): |
||
| 264 | minimum_complete_distance = candidate_maximum_distance; |
||
| 265 | indexes = [index_cluster1, index_cluster2]; |
||
| 266 | |||
| 267 | self.__clusters[indexes[0]] += self.__clusters[indexes[1]]; |
||
| 268 | self.__clusters.pop(indexes[1]); # remove merged cluster. |
||
| 269 | |||
| 270 | |||
| 271 | def __calculate_farthest_distance(self, index_cluster1, index_cluster2): |
||
| 272 | """! |
||
| 273 | @brief Finds two farthest objects in two specified clusters in terms and returns distance between them. |
||
| 274 | |||
| 275 | @param[in] (uint) Index of the first cluster. |
||
| 276 | @param[in] (uint) Index of the second cluster. |
||
| 277 | |||
| 278 | @return The farthest euclidean distance between two clusters. |
||
| 279 | |||
| 280 | """ |
||
| 281 | candidate_maximum_distance = 0.0; |
||
| 282 | for index_object1 in self.__clusters[index_cluster1]: |
||
| 283 | for index_object2 in self.__clusters[index_cluster2]: |
||
| 284 | distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]); |
||
| 285 | |||
| 286 | if (distance > candidate_maximum_distance): |
||
| 287 | candidate_maximum_distance = distance; |
||
| 288 | |||
| 289 | return candidate_maximum_distance; |
||
| 290 | |||
| 291 | |||
| 292 | def __merge_by_signle_link(self): |
||
| 293 | """! |
||
| 294 | @brief Merges the most similar clusters in line with single link type. |
||
| 295 | |||
| 296 | """ |
||
| 297 | |||
| 298 | minimum_single_distance = float('Inf'); |
||
| 299 | indexes = None; |
||
| 300 | |||
| 301 | for index_cluster1 in range(0, len(self.__clusters)): |
||
| 302 | for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)): |
||
| 303 | candidate_minimum_distance = self.__calculate_nearest_distance(index_cluster1, index_cluster2); |
||
| 304 | |||
| 305 | if (candidate_minimum_distance < minimum_single_distance): |
||
| 306 | minimum_single_distance = candidate_minimum_distance; |
||
| 307 | indexes = [index_cluster1, index_cluster2]; |
||
| 308 | |||
| 309 | self.__clusters[indexes[0]] += self.__clusters[indexes[1]]; |
||
| 310 | self.__clusters.pop(indexes[1]); # remove merged cluster. |
||
| 311 | |||
| 312 | |||
| 313 | def __calculate_nearest_distance(self, index_cluster1, index_cluster2): |
||
| 314 | """! |
||
| 315 | @brief Finds two nearest objects in two specified clusters and returns distance between them. |
||
| 316 | |||
| 317 | @param[in] (uint) Index of the first cluster. |
||
| 318 | @param[in] (uint) Index of the second cluster. |
||
| 319 | |||
| 320 | @return The nearest euclidean distance between two clusters. |
||
| 321 | |||
| 322 | """ |
||
| 323 | candidate_minimum_distance = float('Inf'); |
||
| 324 | |||
| 325 | for index_object1 in self.__clusters[index_cluster1]: |
||
| 326 | for index_object2 in self.__clusters[index_cluster2]: |
||
| 327 | distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]); |
||
| 328 | if (distance < candidate_minimum_distance): |
||
| 329 | candidate_minimum_distance = distance; |
||
| 330 | |||
| 331 | return candidate_minimum_distance; |
||
| 332 | |||
| 333 | |||
| 334 | def __calculate_center(self, cluster): |
||
| 335 | """! |
||
| 336 | @brief Calculates new center. |
||
| 337 | |||
| 338 | @return (list) New value of the center of the specified cluster. |
||
| 339 | |||
| 340 | """ |
||
| 341 | |||
| 342 | dimension = len(self.__pointer_data[cluster[0]]); |
||
| 343 | center = [0] * dimension; |
||
| 344 | for index_point in cluster: |
||
| 345 | for index_dimension in range(0, dimension): |
||
| 346 | center[index_dimension] += self.__pointer_data[index_point][index_dimension]; |
||
| 347 | |||
| 348 | for index_dimension in range(0, dimension): |
||
| 349 | center[index_dimension] /= len(cluster); |
||
| 350 | |||
| 351 | return center; |