| Metric | Value |
|---|---|
| Total Complexity | 58 |
| Total Lines | 383 |
| Duplicated Lines | 0 % |
Complex classes like `cure` often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
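In `cure`, the queue and k-d tree bookkeeping is exactly such a component: `__create_queue`, `__create_kdtree`, `__insert_cluster`, `__relocate_cluster`, `__closest_cluster`, `__insert_represented_points` and `__delete_represented_points` all center on `self.__queue` and `self.__tree`. A minimal sketch of what an extracted component could look like follows; the `cure_cluster_repository` name and its interface are hypothetical, not part of pyclustering, and the k-d tree side is omitted:

```python
class cure_cluster_repository:
    """Hypothetical helper extracted from 'cure': owns the queue of clusters
    kept sorted by the distance to their closest neighbor."""

    def __init__(self):
        self.__queue = []

    def insert(self, cluster):
        """Insert a cluster, keeping the queue ordered by its 'distance' attribute."""
        for index in range(len(self.__queue)):
            if cluster.distance < self.__queue[index].distance:
                self.__queue.insert(index, cluster)
                return
        self.__queue.append(cluster)

    def relocate(self, cluster):
        """Re-insert a cluster after its nearest-neighbor distance has changed."""
        self.__queue.remove(cluster)
        self.insert(cluster)

    def closest_pair(self):
        """Return the cluster with the nearest neighbor and that neighbor."""
        head = self.__queue[0]
        return head, head.closest

    def __len__(self):
        return len(self.__queue)
```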
| 1 | """! |
||
```python
class cure:
    """!
    @brief Class represents clustering algorithm CURE.

    Example:
    @code
        # read data for clustering from some file
        sample = read_sample(path_to_data);

        # create instance of cure algorithm for cluster analysis
        # request for allocation of two clusters.
        cure_instance = cure(sample, 2, 5, 0.5, True);

        # run cluster analysis
        cure_instance.process();

        # get results of clustering
        clusters = cure_instance.get_clusters();
    @endcode

    """
```
```python
    def __init__(self, data, number_cluster, number_represent_points = 5, compression = 0.5, ccore = False):
        """!
        @brief Constructor of clustering algorithm CURE.

        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
        @param[in] number_cluster (uint): Number of clusters that should be allocated.
        @param[in] number_represent_points (uint): Number of representative points for each cluster.
        @param[in] compression (double): Coefficient that defines the level of shrinking of representation points toward the mean of the newly created cluster after each merge step. It is usually in the range from 0 to 1.
        @param[in] ccore (bool): If True then the CCORE DLL (C++ implementation) will be used for processing.

        """

        self.__pointer_data = data;

        self.__clusters = None;
        self.__representors = None;
        self.__means = None;

        self.__number_cluster = number_cluster;
        self.__number_represent_points = number_represent_points;
        self.__compression = compression;

        self.__ccore = ccore;

        if (ccore is False):
            self.__create_queue();      # queue
            self.__create_kdtree();     # create k-d tree
```
```python
    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of CURE algorithm.

        @remark Results of clustering can be obtained using corresponding get methods.

        @see get_clusters()

        """

        if (self.__ccore is True):
            cure_data_pointer = wrapper.cure_algorithm(self.__pointer_data, self.__number_cluster, self.__number_represent_points, self.__compression);

            self.__clusters = wrapper.cure_get_clusters(cure_data_pointer);
            self.__representors = wrapper.cure_get_representors(cure_data_pointer);
            self.__means = wrapper.cure_get_means(cure_data_pointer);

            wrapper.cure_data_destroy(cure_data_pointer);

        else:
            while (len(self.__queue) > self.__number_cluster):
                cluster1 = self.__queue[0];     # cluster that has the nearest neighbor.
                cluster2 = cluster1.closest;    # closest cluster.

                #print("Merge decision: \n\t", cluster1, "\n\t", cluster2);

                self.__queue.remove(cluster1);
                self.__queue.remove(cluster2);

                self.__delete_represented_points(cluster1);
                self.__delete_represented_points(cluster2);

                merged_cluster = self.__merge_clusters(cluster1, cluster2);

                self.__insert_represented_points(merged_cluster);

                # Pointers to clusters that should be relocated are stored here.
                cluster_relocation_requests = [];

                # Check for the last cluster
                if (len(self.__queue) > 0):
                    merged_cluster.closest = self.__queue[0];   # arbitrary cluster from queue
                    merged_cluster.distance = self.__cluster_distance(merged_cluster, merged_cluster.closest);

                    for item in self.__queue:
                        distance = self.__cluster_distance(merged_cluster, item);
                        # Check whether the distance between the new cluster and the current one is better than the current best.
                        if (distance < merged_cluster.distance):
                            merged_cluster.closest = item;
                            merged_cluster.distance = distance;

                        # Check if the current cluster has a removed neighbor.
                        if ( (item.closest is cluster1) or (item.closest is cluster2) ):
                            # If the previous distance was less than the distance to the new cluster, then the nearest cluster should be found in the tree.
                            #print("Update: ", item);
                            if (item.distance < distance):
                                (item.closest, item.distance) = self.__closest_cluster(item, distance);

                                # TODO: investigation of the root cause is required.
                                # The cluster itself and the merged cluster should always be in the list of neighbors in line with the specified radius.
                                # But the merged cluster may be missing from that list due to calculation error, therefore it should be added manually.
                                if (item.closest is None):
                                    item.closest = merged_cluster;
                                    item.distance = distance;

                            # Otherwise the new cluster is the nearest.
                            else:
                                item.closest = merged_cluster;
                                item.distance = distance;

                            cluster_relocation_requests.append(item);
                        elif (item.distance > distance):
                            item.closest = merged_cluster;
                            item.distance = distance;

                            cluster_relocation_requests.append(item);

                # The new cluster and the updated clusters should be relocated in the queue.
                self.__insert_cluster(merged_cluster);
                for item in cluster_relocation_requests:
                    self.__relocate_cluster(item);

            # Change cluster representation
            self.__clusters = [ cure_cluster_unit.points for cure_cluster_unit in self.__queue ];
            self.__representors = [ cure_cluster_unit.rep for cure_cluster_unit in self.__queue ];
            self.__means = [ cure_cluster_unit.mean for cure_cluster_unit in self.__queue ];
```
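In the pure-Python branch above, the loop keeps merging the cluster at the head of the queue with its closest neighbor until only `number_cluster` clusters remain; results are then exposed through the getters that follow. A small usage sketch (the toy `sample` values are made up for illustration, and `cure` is assumed to be importable as in the class docstring example):

```python
# Toy two-blob data set; the values are illustrative only.
sample = [[0.0, 0.1], [0.2, 0.0], [0.1, 0.2],
          [5.0, 5.1], [5.2, 5.0], [5.1, 5.2]]

cure_instance = cure(sample, 2)     # ccore defaults to False: pure-Python path
cure_instance.process()

clusters = cure_instance.get_clusters()           # allocated clusters
representors = cure_instance.get_representors()   # shrunk representative points per cluster
means = cure_instance.get_means()                 # mean point of each cluster
```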
```python
    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.

        @return (list) List of allocated clusters.

        @see process()
        @see get_representors()
        @see get_means()

        """

        return self.__clusters;


    def get_representors(self):
        """!
        @brief Returns list of representors of each cluster.
        @details Cluster index should be used for navigation between lists of representors.

        @return (list) List of representors of each cluster.

        @see get_clusters()
        @see get_means()

        """

        return self.__representors;


    def get_means(self):
        """!
        @brief Returns list of mean values of each cluster.
        @details Cluster index should be used for navigation between mean values.

        @return (list) List of mean values of each cluster.

        @see get_clusters()
        @see get_representors()

        """

        return self.__means;
```
```python
    def __insert_cluster(self, cluster):
        """!
        @brief Insert cluster to the list (sorted queue) in line with sequence order (distance).

        @param[in] cluster (cure_cluster): Cluster that should be inserted.

        """

        for index in range(len(self.__queue)):
            if (cluster.distance < self.__queue[index].distance):
                self.__queue.insert(index, cluster);
                return;

        self.__queue.append(cluster);


    def __relocate_cluster(self, cluster):
        """!
        @brief Relocate cluster in list in line with distance order.

        @param[in] cluster (cure_cluster): Cluster that should be relocated in line with order.

        """

        self.__queue.remove(cluster);
        self.__insert_cluster(cluster);
```
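Both helpers above keep `self.__queue` sorted by `distance` with a linear scan, so each insertion is O(n). If this bookkeeping were reworked, the standard-library `bisect` module could perform the ordered insertion with a binary search instead; the sketch below is an alternative, not how pyclustering implements it, and the `key=` argument of `bisect.insort` requires Python 3.10+:

```python
import bisect

def insert_cluster_sorted(queue, cluster):
    """Ordered insertion by the 'distance' attribute via binary search."""
    bisect.insort(queue, cluster, key=lambda c: c.distance)

def relocate_cluster_sorted(queue, cluster):
    """Re-insert a cluster whose distance has changed.
    Note: list.remove() is still a linear scan, so this is not asymptotically better."""
    queue.remove(cluster)
    insert_cluster_sorted(queue, cluster)
```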
```python
    def __closest_cluster(self, cluster, distance):
        """!
        @brief Find the closest cluster to the specified cluster in line with distance.

        @param[in] cluster (cure_cluster): Cluster for which the nearest cluster should be found.
        @param[in] distance (double): Closest distance to the previous cluster.

        @return (tuple) Pair (nearest CURE cluster, nearest distance); if no nearest cluster is found, the pair contains None and an infinite distance.

        """

        nearest_cluster = None;
        nearest_distance = float('inf');

        for point in cluster.rep:
            # Nearest nodes should be returned (at least it will return itself).
            nearest_nodes = self.__tree.find_nearest_dist_nodes(point, distance);
            for (candidate_distance, kdtree_node) in nearest_nodes:
                if ( (candidate_distance < nearest_distance) and (kdtree_node is not None) and (kdtree_node.payload is not cluster) ):
                    nearest_distance = candidate_distance;
                    nearest_cluster = kdtree_node.payload;

        return (nearest_cluster, nearest_distance);
```
```python
    def __insert_represented_points(self, cluster):
        """!
        @brief Insert representation points to the k-d tree.

        @param[in] cluster (cure_cluster): Cluster whose representation points should be inserted.

        """

        for point in cluster.rep:
            self.__tree.insert(point, cluster);


    def __delete_represented_points(self, cluster):
        """!
        @brief Remove representation points of clusters from the k-d tree.

        @param[in] cluster (cure_cluster): Cluster whose representation points should be removed.

        """

        for point in cluster.rep:
            self.__tree.remove(point);
```
```python
    def __merge_clusters(self, cluster1, cluster2):
        """!
        @brief Merges two clusters and returns the new merged cluster. Representation points and mean points are calculated for the new cluster.

        @param[in] cluster1 (cure_cluster): Cluster that should be merged.
        @param[in] cluster2 (cure_cluster): Cluster that should be merged.

        @return (cure_cluster) New merged CURE cluster.

        """

        merged_cluster = cure_cluster();

        merged_cluster.points = cluster1.points + cluster2.points;

        # merged_cluster.mean = ( len(cluster1.points) * cluster1.mean + len(cluster2.points) * cluster2.mean ) / ( len(cluster1.points) + len(cluster2.points) );
        dimension = len(cluster1.mean);
        merged_cluster.mean = [0] * dimension;
        if merged_cluster.points[1:] == merged_cluster.points[:-1]:
            merged_cluster.mean = merged_cluster.points[0]
        else:
            for index in range(dimension):
                merged_cluster.mean[index] = ( len(cluster1.points) * cluster1.mean[index] + len(cluster2.points) * cluster2.mean[index] ) / ( len(cluster1.points) + len(cluster2.points) );

        temporary = list();     # TODO: A set should be used in line with the specification (article), but a list is not a hashable object, therefore it cannot be stored in a set.

        for index in range(self.__number_represent_points):
            maximal_distance = 0;
            maximal_point = None;

            for point in merged_cluster.points:
                minimal_distance = 0;
                if (index == 0):
                    minimal_distance = euclidean_distance(point, merged_cluster.mean);
                    #minimal_distance = euclidean_distance_sqrt(point, merged_cluster.mean);
                else:
                    minimal_distance = euclidean_distance(point, temporary[0]);
                    #minimal_distance = cluster_distance(cure_cluster(point), cure_cluster(temporary[0]));

                if (minimal_distance >= maximal_distance):
                    maximal_distance = minimal_distance;
                    maximal_point = point;

            if (maximal_point not in temporary):
                temporary.append(maximal_point);

        for point in temporary:
            representative_point = [0] * dimension;
            for index in range(dimension):
                representative_point[index] = point[index] + self.__compression * (merged_cluster.mean[index] - point[index]);

            merged_cluster.rep.append(representative_point);

        return merged_cluster;
```
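Two details of `__merge_clusters` are worth spelling out. The `points[1:] == points[:-1]` check detects the degenerate case where every merged point is identical, so the mean is taken directly from the first point. The representative points are then picked by farthest-point selection and shrunk toward the merged mean by the `compression` factor, i.e. `rep[i] = point[i] + compression * (mean[i] - point[i])`. A tiny worked example of the shrinking step (the numbers are illustrative):

```python
compression = 0.5
mean = [2.0, 2.0]
point = [4.0, 0.0]

# Each coordinate moves a 'compression' fraction of the way toward the mean.
representative = [point[i] + compression * (mean[i] - point[i]) for i in range(len(point))]
print(representative)   # [3.0, 1.0] -> halfway between the point and the cluster mean
```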
```python
    def __create_queue(self):
        """!
        @brief Creates a queue of clusters sorted by the distance between them, where the first cluster has the nearest neighbor. At the first iteration each cluster contains only one point.

        @return (list) Queue of clusters sorted by the distance between them.

        """

        self.__queue = [cure_cluster(point) for point in self.__pointer_data];

        # set closest clusters
        for i in range(0, len(self.__queue)):
            minimal_distance = float('inf');
            closest_index_cluster = -1;

            for k in range(0, len(self.__queue)):
                if (i != k):
                    dist = self.__cluster_distance(self.__queue[i], self.__queue[k]);
                    if (dist < minimal_distance):
                        minimal_distance = dist;
                        closest_index_cluster = k;

            self.__queue[i].closest = self.__queue[closest_index_cluster];
            self.__queue[i].distance = minimal_distance;

        # sort clusters
        self.__queue.sort(key = lambda x: x.distance, reverse = False);
```
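The initialization above is quadratic: every singleton cluster is compared with every other one to find its nearest neighbor, and the queue is then sorted by that distance. The same brute-force step can be expressed compactly outside the class; the sketch below is illustrative only (it assumes plain Euclidean distance and is not the pyclustering implementation):

```python
import math

def initial_nearest_neighbors(points):
    """For each point return (index, index of nearest neighbor, distance),
    sorted by distance -- mirroring how the queue orders singleton clusters."""
    def dist(a, b):
        return math.sqrt(sum((ai - bi) ** 2 for ai, bi in zip(a, b)))

    pairs = []
    for i, p in enumerate(points):
        j, d = min(((k, dist(p, q)) for k, q in enumerate(points) if k != i),
                   key=lambda item: item[1])
        pairs.append((i, j, d))
    return sorted(pairs, key=lambda item: item[2])
```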
```python
    def __create_kdtree(self):
        """!
        @brief Create k-d tree in line with created clusters. At the first iteration it contains all points from the input data set.

        @return (kdtree) k-d tree that consists of representative points of CURE clusters.

        """

        self.__tree = kdtree();
        for current_cluster in self.__queue:
            for representative_point in current_cluster.rep:
                self.__tree.insert(representative_point, current_cluster);
```
```python
    def __cluster_distance(self, cluster1, cluster2):
        """!
        @brief Calculate minimal distance between clusters using representative points.

        @param[in] cluster1 (cure_cluster): The first cluster.
        @param[in] cluster2 (cure_cluster): The second cluster.

        @return (double) Euclidean distance between two clusters that is defined by the minimum distance between representation points of two clusters.

        """

        distance = float('inf');
        for i in range(0, len(cluster1.rep)):
            for k in range(0, len(cluster2.rep)):
                #dist = euclidean_distance_sqrt(cluster1.rep[i], cluster2.rep[k]);  # Fast mode
                dist = euclidean_distance(cluster1.rep[i], cluster2.rep[k]);        # Slow mode
                if (dist < distance):
                    distance = dist;

        return distance;
```
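In other words, the distance between two clusters is the minimum Euclidean distance over all pairs of their representative points. The same definition written as a standalone function for clarity (a sketch with the same semantics, not the pyclustering code):

```python
import math
from itertools import product

def representative_distance(rep1, rep2):
    """Minimum Euclidean distance over all pairs of representative points."""
    def dist(a, b):
        return math.sqrt(sum((ai - bi) ** 2 for ai, bi in zip(a, b)))
    return min(dist(p, q) for p, q in product(rep1, rep2))
```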