| Total Complexity | 44 |
| Total Lines | 269 |
| Duplicated Lines | 14.13 % |
| Changes | 0 | ||
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems, and corresponding solutions are:
Complex classes like agglomerative often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | """! |
||
| 57 | class agglomerative: |
||
| 58 | """! |
||
| 59 | @brief Class represents agglomerative algorithm for cluster analysis. |
||
| 60 | |||
| 61 | Example: |
||
| 62 | @code |
||
| 63 | # sample for cluster analysis (represented by list) |
||
| 64 | sample = read_sample(path_to_sample); |
||
| 65 | |||
| 66 | # create object that uses python code only |
||
| 67 | agglomerative_instance = agglomerative(sample, 2, link_type.CENTROID_LINK) |
||
| 68 | |||
| 69 | # cluster analysis |
||
| 70 | agglomerative_instance.process(); |
||
| 71 | |||
| 72 | # obtain results of clustering |
||
| 73 | clusters = agglomerative_instance.get_clusters(); |
||
| 74 | @endcode |
||
| 75 | |||
| 76 | """ |
||
| 77 | |||
| 78 | def __init__(self, data, number_clusters, link = None, ccore = False): |
||
| 79 | """! |
||
| 80 | @brief Constructor of agglomerative hierarchical algorithm. |
||
| 81 | |||
| 82 | @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by a list or tuple. |
||
| 83 | @param[in] number_clusters (uint): Number of clusters that should be allocated. |
||
| 84 | @param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default. |
||
| 85 | @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not. |
||
| 86 | |||
| 87 | """ |
||
| 88 | |||
| 89 | self.__pointer_data = data; |
||
| 90 | self.__number_clusters = number_clusters; |
||
| 91 | self.__similarity = link; |
||
| 92 | |||
| 93 | if (self.__similarity is None): |
||
| 94 | self.__similarity = type_link.CENTROID_LINK; |
||
| 95 | |||
| 96 | self.__clusters = []; |
||
| 97 | self.__ccore = ccore; |
||
| 98 | |||
| 99 | if (self.__similarity == type_link.CENTROID_LINK): |
||
| 100 | self.__centers = self.__pointer_data.copy(); # used in case of usage of centroid links |
||
| 101 | |||
| 102 | |||
| 103 | def process(self): |
||
| 104 | """! |
||
| 105 | @brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity. |
||
| 106 | |||
| 107 | @see get_clusters() |
||
| 108 | |||
| 109 | """ |
||
| 110 | |||
| 111 | if (self.__ccore is True): |
||
| 112 | self.__clusters = wrapper.agglomerative_algorithm(self.__pointer_data, self.__number_clusters, self.__similarity); |
||
| 113 | |||
| 114 | else: |
||
| 115 | self.__clusters = [[index] for index in range(0, len(self.__pointer_data))]; |
||
| 116 | |||
| 117 | current_number_clusters = len(self.__clusters); |
||
| 118 | |||
| 119 | while (current_number_clusters > self.__number_clusters): |
||
| 120 | self.__merge_similar_clusters(); |
||
| 121 | current_number_clusters = len(self.__clusters); |
||
| 122 | |||
| 123 | |||
| 124 | def get_clusters(self): |
||
| 125 | """! |
||
| 126 | @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data. |
||
| 127 | |||
| 128 | @remark Results of clustering can be obtained using corresponding gets methods. |
||
| 129 | |||
| 130 | @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data. |
||
| 131 | |||
| 132 | @see process() |
||
| 133 | |||
| 134 | """ |
||
| 135 | |||
| 136 | return self.__clusters; |
||
| 137 | |||
| 138 | |||
| 139 | def get_cluster_encoding(self): |
||
| 140 | """! |
||
| 141 | @brief Returns clustering result representation type that indicate how clusters are encoded. |
||
| 142 | |||
| 143 | @return (type_encoding) Clustering result representation. |
||
| 144 | |||
| 145 | @see get_clusters() |
||
| 146 | |||
| 147 | """ |
||
| 148 | |||
| 149 | return type_encoding.CLUSTER_INDEX_LIST_SEPARATION; |
||
| 150 | |||
| 151 | |||
| 152 | def __merge_similar_clusters(self): |
||
| 153 | """! |
||
| 154 | @brief Merges the most similar clusters in line with link type. |
||
| 155 | |||
| 156 | """ |
||
| 157 | |||
| 158 | if (self.__similarity == type_link.AVERAGE_LINK): |
||
| 159 | self.__merge_by_average_link(); |
||
| 160 | |||
| 161 | elif (self.__similarity == type_link.CENTROID_LINK): |
||
| 162 | self.__merge_by_centroid_link(); |
||
| 163 | |||
| 164 | elif (self.__similarity == type_link.COMPLETE_LINK): |
||
| 165 | self.__merge_by_complete_link(); |
||
| 166 | |||
| 167 | elif (self.__similarity == type_link.SINGLE_LINK): |
||
| 168 | self.__merge_by_signle_link(); |
||
| 169 | |||
| 170 | else: |
||
| 171 | raise NameError('Not supported similarity is used'); |
||
| 172 | |||
| 173 | |||
| 174 | def __merge_by_average_link(self): |
||
| 175 | """! |
||
| 176 | @brief Merges the most similar clusters in line with average link type. |
||
| 177 | |||
| 178 | """ |
||
| 179 | |||
| 180 | minimum_average_distance = float('Inf'); |
||
| 181 | |||
| 182 | for index_cluster1 in range(0, len(self.__clusters)): |
||
| 183 | for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)): |
||
| 184 | |||
| 185 | # Find farthest objects |
||
| 186 | candidate_average_distance = 0.0; |
||
| 187 | for index_object1 in self.__clusters[index_cluster1]: |
||
| 188 | for index_object2 in self.__clusters[index_cluster2]: |
||
| 189 | candidate_average_distance += euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]); |
||
| 190 | |||
| 191 | candidate_average_distance /= (len(self.__clusters[index_cluster1]) + len(self.__clusters[index_cluster2])); |
||
| 192 | |||
| 193 | if (candidate_average_distance < minimum_average_distance): |
||
| 194 | minimum_average_distance = candidate_average_distance; |
||
| 195 | indexes = [index_cluster1, index_cluster2]; |
||
| 196 | |||
| 197 | self.__clusters[indexes[0]] += self.__clusters[indexes[1]]; |
||
| 198 | self.__clusters.pop(indexes[1]); # remove merged cluster. |
||
| 199 | |||
| 200 | |||
| 201 | def __merge_by_centroid_link(self): |
||
| 202 | """! |
||
| 203 | @brief Merges the most similar clusters in line with centroid link type. |
||
| 204 | |||
| 205 | """ |
||
| 206 | |||
| 207 | minimum_centroid_distance = float('Inf'); |
||
| 208 | View Code Duplication | indexes = None; |
|
|
|
|||
| 209 | |||
| 210 | for index1 in range(0, len(self.__centers)): |
||
| 211 | for index2 in range(index1 + 1, len(self.__centers)): |
||
| 212 | distance = euclidean_distance_sqrt(self.__centers[index1], self.__centers[index2]); |
||
| 213 | if (distance < minimum_centroid_distance): |
||
| 214 | minimum_centroid_distance = distance; |
||
| 215 | indexes = [index1, index2]; |
||
| 216 | |||
| 217 | self.__clusters[indexes[0]] += self.__clusters[indexes[1]]; |
||
| 218 | self.__centers[indexes[0]] = self.__calculate_center(self.__clusters[indexes[0]]); |
||
| 219 | |||
| 220 | self.__clusters.pop(indexes[1]); # remove merged cluster. |
||
| 221 | self.__centers.pop(indexes[1]); # remove merged center. |
||
| 222 | |||
| 223 | |||
| 224 | def __merge_by_complete_link(self): |
||
| 225 | """! |
||
| 226 | @brief Merges the most similar clusters in line with complete link type. |
||
| 227 | |||
| 228 | """ |
||
| 229 | |||
| 230 | minimum_complete_distance = float('Inf'); |
||
| 231 | indexes = None; |
||
| 232 | |||
| 233 | for index_cluster1 in range(0, len(self.__clusters)): |
||
| 234 | for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)): |
||
| 235 | candidate_maximum_distance = self.__calculate_farthest_distance(index_cluster1, index_cluster2); |
||
| 236 | |||
| 237 | if (candidate_maximum_distance < minimum_complete_distance): |
||
| 238 | minimum_complete_distance = candidate_maximum_distance; |
||
| 239 | indexes = [index_cluster1, index_cluster2]; |
||
| 240 | |||
| 241 | self.__clusters[indexes[0]] += self.__clusters[indexes[1]]; |
||
| 242 | self.__clusters.pop(indexes[1]); # remove merged cluster. |
||
| 243 | |||
| 244 | |||
| 245 | def __calculate_farthest_distance(self, index_cluster1, index_cluster2): |
||
| 246 | """! |
||
| 247 | @brief Finds two farthest objects in two specified clusters in terms and returns distance between them. |
||
| 248 | |||
| 249 | @param[in] (uint) Index of the first cluster. |
||
| 250 | View Code Duplication | @param[in] (uint) Index of the second cluster. |
|
| 251 | |||
| 252 | @return The farthest euclidean distance between two clusters. |
||
| 253 | |||
| 254 | """ |
||
| 255 | candidate_maximum_distance = 0.0; |
||
| 256 | for index_object1 in self.__clusters[index_cluster1]: |
||
| 257 | for index_object2 in self.__clusters[index_cluster2]: |
||
| 258 | distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]); |
||
| 259 | |||
| 260 | if (distance > candidate_maximum_distance): |
||
| 261 | candidate_maximum_distance = distance; |
||
| 262 | |||
| 263 | return candidate_maximum_distance; |
||
| 264 | |||
| 265 | |||
| 266 | def __merge_by_signle_link(self): |
||
| 267 | """! |
||
| 268 | @brief Merges the most similar clusters in line with single link type. |
||
| 269 | |||
| 270 | """ |
||
| 271 | |||
| 272 | minimum_single_distance = float('Inf'); |
||
| 273 | indexes = None; |
||
| 274 | |||
| 275 | for index_cluster1 in range(0, len(self.__clusters)): |
||
| 276 | for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)): |
||
| 277 | candidate_minimum_distance = self.__calculate_nearest_distance(index_cluster1, index_cluster2); |
||
| 278 | |||
| 279 | if (candidate_minimum_distance < minimum_single_distance): |
||
| 280 | minimum_single_distance = candidate_minimum_distance; |
||
| 281 | indexes = [index_cluster1, index_cluster2]; |
||
| 282 | |||
| 283 | self.__clusters[indexes[0]] += self.__clusters[indexes[1]]; |
||
| 284 | self.__clusters.pop(indexes[1]); # remove merged cluster. |
||
| 285 | |||
| 286 | |||
| 287 | def __calculate_nearest_distance(self, index_cluster1, index_cluster2): |
||
| 288 | """! |
||
| 289 | @brief Finds two nearest objects in two specified clusters and returns distance between them. |
||
| 290 | |||
| 291 | @param[in] (uint) Index of the first cluster. |
||
| 292 | @param[in] (uint) Index of the second cluster. |
||
| 293 | |||
| 294 | @return The nearest euclidean distance between two clusters. |
||
| 295 | |||
| 296 | """ |
||
| 297 | candidate_minimum_distance = float('Inf'); |
||
| 298 | |||
| 299 | for index_object1 in self.__clusters[index_cluster1]: |
||
| 300 | for index_object2 in self.__clusters[index_cluster2]: |
||
| 301 | distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]); |
||
| 302 | if (distance < candidate_minimum_distance): |
||
| 303 | candidate_minimum_distance = distance; |
||
| 304 | |||
| 305 | return candidate_minimum_distance; |
||
| 306 | |||
| 307 | |||
| 308 | def __calculate_center(self, cluster): |
||
| 309 | """! |
||
| 310 | @brief Calculates new center. |
||
| 311 | |||
| 312 | @return (list) New value of the center of the specified cluster. |
||
| 313 | |||
| 314 | """ |
||
| 315 | |||
| 316 | dimension = len(self.__pointer_data[cluster[0]]); |
||
| 317 | center = [0] * dimension; |
||
| 318 | for index_point in cluster: |
||
| 319 | for index_dimension in range(0, dimension): |
||
| 320 | center[index_dimension] += self.__pointer_data[index_point][index_dimension]; |
||
| 321 | |||
| 322 | for index_dimension in range(0, dimension): |
||
| 323 | center[index_dimension] /= len(cluster); |
||
| 324 | |||
| 325 | return center; |