| Total Complexity | 43 |
| Total Lines | 256 |
| Duplicated Lines | 14.84 % |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems, and corresponding solutions are:
Complex classes like agglomerative often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | """! |
||
| 54 | class agglomerative: |
||
| 55 | """! |
||
| 56 | @brief Class represents agglomerative algorithm for cluster analysis. |
||
| 57 | |||
| 58 | Example: |
||
| 59 | @code |
||
| 60 | # sample for cluster analysis (represented by list) |
||
| 61 | sample = read_sample(path_to_sample); |
||
| 62 | |||
| 63 | # create object that uses python code only |
||
| 64 | agglomerative_instance = agglomerative(sample, 2, link_type.CENTROID_LINK) |
||
| 65 | |||
| 66 | # cluster analysis |
||
| 67 | agglomerative_instance.process(); |
||
| 68 | |||
| 69 | # obtain results of clustering |
||
| 70 | clusters = agglomerative_instance.get_clusters(); |
||
| 71 | @endcode |
||
| 72 | |||
| 73 | """ |
||
| 74 | |||
| 75 | def __init__(self, data, number_clusters, link = None, ccore = False): |
||
| 76 | """! |
||
| 77 | @brief Constructor of agglomerative hierarchical algorithm. |
||
| 78 | |||
| 79 | @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by a list or tuple. |
||
| 80 | @param[in] number_clusters (uint): Number of clusters that should be allocated. |
||
| 81 | @param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default. |
||
| 82 | @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not. |
||
| 83 | |||
| 84 | """ |
||
| 85 | |||
| 86 | self.__pointer_data = data; |
||
| 87 | self.__number_clusters = number_clusters; |
||
| 88 | self.__similarity = link; |
||
| 89 | |||
| 90 | if (self.__similarity is None): |
||
| 91 | self.__similarity = type_link.CENTROID_LINK; |
||
| 92 | |||
| 93 | self.__clusters = []; |
||
| 94 | self.__ccore = ccore; |
||
| 95 | |||
| 96 | if (self.__similarity == type_link.CENTROID_LINK): |
||
| 97 | self.__centers = self.__pointer_data.copy(); # used in case of usage of centroid links |
||
| 98 | |||
| 99 | |||
| 100 | def process(self): |
||
| 101 | """! |
||
| 102 | @brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity. |
||
| 103 | |||
| 104 | @see get_clusters() |
||
| 105 | |||
| 106 | """ |
||
| 107 | |||
| 108 | if (self.__ccore is True): |
||
| 109 | self.__clusters = wrapper.agglomerative_algorithm(self.__pointer_data, self.__number_clusters, self.__similarity); |
||
| 110 | |||
| 111 | else: |
||
| 112 | self.__clusters = [[index] for index in range(0, len(self.__pointer_data))]; |
||
| 113 | |||
| 114 | current_number_clusters = len(self.__clusters); |
||
| 115 | |||
| 116 | while (current_number_clusters > self.__number_clusters): |
||
| 117 | self.__merge_similar_clusters(); |
||
| 118 | current_number_clusters = len(self.__clusters); |
||
| 119 | |||
| 120 | |||
| 121 | def get_clusters(self): |
||
| 122 | """! |
||
| 123 | @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data. |
||
| 124 | |||
| 125 | @remark Results of clustering can be obtained using corresponding gets methods. |
||
| 126 | |||
| 127 | @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data. |
||
| 128 | |||
| 129 | @see process() |
||
| 130 | |||
| 131 | """ |
||
| 132 | |||
| 133 | return self.__clusters; |
||
| 134 | |||
| 135 | |||
| 136 | def __merge_similar_clusters(self): |
||
| 137 | """! |
||
| 138 | @brief Merges the most similar clusters in line with link type. |
||
| 139 | |||
| 140 | """ |
||
| 141 | |||
| 142 | if (self.__similarity == type_link.AVERAGE_LINK): |
||
| 143 | self.__merge_by_average_link(); |
||
| 144 | |||
| 145 | elif (self.__similarity == type_link.CENTROID_LINK): |
||
| 146 | self.__merge_by_centroid_link(); |
||
| 147 | |||
| 148 | elif (self.__similarity == type_link.COMPLETE_LINK): |
||
| 149 | self.__merge_by_complete_link(); |
||
| 150 | |||
| 151 | elif (self.__similarity == type_link.SINGLE_LINK): |
||
| 152 | self.__merge_by_signle_link(); |
||
| 153 | |||
| 154 | else: |
||
| 155 | raise NameError('Not supported similarity is used'); |
||
| 156 | |||
| 157 | |||
| 158 | def __merge_by_average_link(self): |
||
| 159 | """! |
||
| 160 | @brief Merges the most similar clusters in line with average link type. |
||
| 161 | |||
| 162 | """ |
||
| 163 | |||
| 164 | minimum_average_distance = float('Inf'); |
||
| 165 | |||
| 166 | for index_cluster1 in range(0, len(self.__clusters)): |
||
| 167 | for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)): |
||
| 168 | |||
| 169 | # Find farthest objects |
||
| 170 | candidate_average_distance = 0.0; |
||
| 171 | for index_object1 in self.__clusters[index_cluster1]: |
||
| 172 | for index_object2 in self.__clusters[index_cluster2]: |
||
| 173 | candidate_average_distance += euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]); |
||
| 174 | |||
| 175 | candidate_average_distance /= (len(self.__clusters[index_cluster1]) + len(self.__clusters[index_cluster2])); |
||
| 176 | |||
| 177 | if (candidate_average_distance < minimum_average_distance): |
||
| 178 | minimum_average_distance = candidate_average_distance; |
||
| 179 | indexes = [index_cluster1, index_cluster2]; |
||
| 180 | |||
| 181 | self.__clusters[indexes[0]] += self.__clusters[indexes[1]]; |
||
| 182 | self.__clusters.pop(indexes[1]); # remove merged cluster. |
||
| 183 | |||
| 184 | |||
| 185 | def __merge_by_centroid_link(self): |
||
| 186 | """! |
||
| 187 | @brief Merges the most similar clusters in line with centroid link type. |
||
| 188 | |||
| 189 | """ |
||
| 190 | |||
| 191 | minimum_centroid_distance = float('Inf'); |
||
| 192 | indexes = None; |
||
| 193 | |||
| 194 | for index1 in range(0, len(self.__centers)): |
||
| 195 | for index2 in range(index1 + 1, len(self.__centers)): |
||
| 196 | distance = euclidean_distance_sqrt(self.__centers[index1], self.__centers[index2]); |
||
| 197 | if (distance < minimum_centroid_distance): |
||
| 198 | minimum_centroid_distance = distance; |
||
| 199 | indexes = [index1, index2]; |
||
| 200 | |||
| 201 | self.__clusters[indexes[0]] += self.__clusters[indexes[1]]; |
||
| 202 | self.__centers[indexes[0]] = self.__calculate_center(self.__clusters[indexes[0]]); |
||
| 203 | |||
| 204 | self.__clusters.pop(indexes[1]); # remove merged cluster. |
||
| 205 | self.__centers.pop(indexes[1]); # remove merged center. |
||
| 206 | |||
| 207 | |||
| 208 | View Code Duplication | def __merge_by_complete_link(self): |
|
|
|
|||
| 209 | """! |
||
| 210 | @brief Merges the most similar clusters in line with complete link type. |
||
| 211 | |||
| 212 | """ |
||
| 213 | |||
| 214 | minimum_complete_distance = float('Inf'); |
||
| 215 | indexes = None; |
||
| 216 | |||
| 217 | for index_cluster1 in range(0, len(self.__clusters)): |
||
| 218 | for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)): |
||
| 219 | candidate_maximum_distance = self.__calculate_farthest_distance(index_cluster1, index_cluster2); |
||
| 220 | |||
| 221 | if (candidate_maximum_distance < minimum_complete_distance): |
||
| 222 | minimum_complete_distance = candidate_maximum_distance; |
||
| 223 | indexes = [index_cluster1, index_cluster2]; |
||
| 224 | |||
| 225 | self.__clusters[indexes[0]] += self.__clusters[indexes[1]]; |
||
| 226 | self.__clusters.pop(indexes[1]); # remove merged cluster. |
||
| 227 | |||
| 228 | |||
| 229 | def __calculate_farthest_distance(self, index_cluster1, index_cluster2): |
||
| 230 | """! |
||
| 231 | @brief Finds two farthest objects in two specified clusters in terms and returns distance between them. |
||
| 232 | |||
| 233 | @param[in] (uint) Index of the first cluster. |
||
| 234 | @param[in] (uint) Index of the second cluster. |
||
| 235 | |||
| 236 | @return The farthest euclidean distance between two clusters. |
||
| 237 | |||
| 238 | """ |
||
| 239 | candidate_maximum_distance = 0.0; |
||
| 240 | for index_object1 in self.__clusters[index_cluster1]: |
||
| 241 | for index_object2 in self.__clusters[index_cluster2]: |
||
| 242 | distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]); |
||
| 243 | |||
| 244 | if (distance > candidate_maximum_distance): |
||
| 245 | candidate_maximum_distance = distance; |
||
| 246 | |||
| 247 | return candidate_maximum_distance; |
||
| 248 | |||
| 249 | |||
| 250 | View Code Duplication | def __merge_by_signle_link(self): |
|
| 251 | """! |
||
| 252 | @brief Merges the most similar clusters in line with single link type. |
||
| 253 | |||
| 254 | """ |
||
| 255 | |||
| 256 | minimum_single_distance = float('Inf'); |
||
| 257 | indexes = None; |
||
| 258 | |||
| 259 | for index_cluster1 in range(0, len(self.__clusters)): |
||
| 260 | for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)): |
||
| 261 | candidate_minimum_distance = self.__calculate_nearest_distance(index_cluster1, index_cluster2); |
||
| 262 | |||
| 263 | if (candidate_minimum_distance < minimum_single_distance): |
||
| 264 | minimum_single_distance = candidate_minimum_distance; |
||
| 265 | indexes = [index_cluster1, index_cluster2]; |
||
| 266 | |||
| 267 | self.__clusters[indexes[0]] += self.__clusters[indexes[1]]; |
||
| 268 | self.__clusters.pop(indexes[1]); # remove merged cluster. |
||
| 269 | |||
| 270 | |||
| 271 | def __calculate_nearest_distance(self, index_cluster1, index_cluster2): |
||
| 272 | """! |
||
| 273 | @brief Finds two nearest objects in two specified clusters and returns distance between them. |
||
| 274 | |||
| 275 | @param[in] (uint) Index of the first cluster. |
||
| 276 | @param[in] (uint) Index of the second cluster. |
||
| 277 | |||
| 278 | @return The nearest euclidean distance between two clusters. |
||
| 279 | |||
| 280 | """ |
||
| 281 | candidate_minimum_distance = float('Inf'); |
||
| 282 | |||
| 283 | for index_object1 in self.__clusters[index_cluster1]: |
||
| 284 | for index_object2 in self.__clusters[index_cluster2]: |
||
| 285 | distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]); |
||
| 286 | if (distance < candidate_minimum_distance): |
||
| 287 | candidate_minimum_distance = distance; |
||
| 288 | |||
| 289 | return candidate_minimum_distance; |
||
| 290 | |||
| 291 | |||
| 292 | def __calculate_center(self, cluster): |
||
| 293 | """! |
||
| 294 | @brief Calculates new center. |
||
| 295 | |||
| 296 | @return (list) New value of the center of the specified cluster. |
||
| 297 | |||
| 298 | """ |
||
| 299 | |||
| 300 | dimension = len(self.__pointer_data[cluster[0]]); |
||
| 301 | center = [0] * dimension; |
||
| 302 | for index_point in cluster: |
||
| 303 | for index_dimension in range(0, dimension): |
||
| 304 | center[index_dimension] += self.__pointer_data[index_point][index_dimension]; |
||
| 305 | |||
| 306 | for index_dimension in range(0, dimension): |
||
| 307 | center[index_dimension] /= len(cluster); |
||
| 308 | |||
| 309 | return center; |