| Total Complexity | 54 |
| Total Lines | 348 |
| Duplicated Lines | 0 % |
Complex classes like pyclustering.cluster.cure often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | """! |
||
class cure:
    """!
    @brief Class represents clustering algorithm CURE.

    Example:
    @code
        # read data for clustering from some file
        sample = read_sample(path_to_data)

        # create instance of cure algorithm for cluster analysis
        # request for allocation of two clusters.
        cure_instance = cure(sample, 2, 5, 0.5, True)

        # run cluster analysis
        cure_instance.process()

        # get results of clustering
        clusters = cure_instance.get_clusters()
    @endcode

    """

    def __init__(self, data, number_cluster, number_represent_points = 5, compression = 0.5, ccore = False):
        """!
        @brief Constructor of clustering algorithm CURE.

        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
        @param[in] number_cluster (uint): Number of clusters that should be allocated.
        @param[in] number_represent_points (uint): Number of representative points for each cluster.
        @param[in] compression (double): Coefficient that defines the level of shrinking of representation points toward the mean of the newly created cluster after merging on each step. Usually distributed from 0 to 1.
        @param[in] ccore (bool): If True then DLL CCORE (C++ solution) will be used for solving.

        """

        self.__pointer_data = data
        self.__number_cluster = number_cluster
        self.__number_represent_points = number_represent_points
        self.__compression = compression
        self.__ccore = ccore

        # Clustering state; all are instance attributes (the original declared
        # them at class level, which risks accidental sharing between instances).
        self.__clusters = None
        self.__queue = None
        self.__tree = None

        if ccore is False:
            self.__create_queue()   # sorted list of single-point clusters
            self.__create_kdtree()  # k-d tree over representative points


    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of CURE algorithm.

        @remark Results of clustering can be obtained using corresponding get methods.

        @see get_clusters()

        """

        if self.__ccore is True:
            self.__clusters = wrapper.cure(self.__pointer_data, self.__number_cluster, self.__number_represent_points, self.__compression)
            return

        # Agglomerative merging: repeatedly merge the two closest clusters
        # until the requested number of clusters remains.
        while len(self.__queue) > self.__number_cluster:
            cluster1 = self.__queue[0]      # cluster that has the nearest neighbor
            cluster2 = cluster1.closest     # its closest cluster

            self.__queue.remove(cluster1)
            self.__queue.remove(cluster2)

            self.__delete_represented_points(cluster1)
            self.__delete_represented_points(cluster2)

            merged_cluster = self.__merge_clusters(cluster1, cluster2)

            self.__insert_represented_points(merged_cluster)

            # Pointers to clusters that should be relocated are stored here.
            cluster_relocation_requests = []

            # Check for the last cluster.
            if len(self.__queue) > 0:
                merged_cluster.closest = self.__queue[0]    # arbitrary cluster from queue
                merged_cluster.distance = self.__cluster_distance(merged_cluster, merged_cluster.closest)

                for item in self.__queue:
                    distance = self.__cluster_distance(merged_cluster, item)

                    # Check if the distance between the new cluster and the current one is the best so far.
                    if distance < merged_cluster.distance:
                        merged_cluster.closest = item
                        merged_cluster.distance = distance

                    # Check if the current cluster lost its nearest neighbor due to the merge.
                    if (item.closest is cluster1) or (item.closest is cluster2):
                        # If the previous distance was less than the distance to the new cluster
                        # then the nearest cluster should be searched for in the tree.
                        if item.distance < distance:
                            (item.closest, item.distance) = self.__closest_cluster(item, distance)

                            # TODO: investigation of root cause is required.
                            # Itself and the merged cluster should always be in the list of neighbors
                            # in line with the specified radius. But the merged cluster may not be in
                            # the list due to calculation error, therefore it is added manually.
                            if item.closest is None:
                                item.closest = merged_cluster
                                item.distance = distance

                        # Otherwise the new cluster is the nearest one.
                        else:
                            item.closest = merged_cluster
                            item.distance = distance

                        cluster_relocation_requests.append(item)
                    elif item.distance > distance:
                        item.closest = merged_cluster
                        # BUG FIX: the original assigned 'item.ditance' (typo), which created a
                        # dead attribute and left the stored distance stale even though the
                        # closest pointer had been repointed to the merged cluster.
                        item.distance = distance

                        cluster_relocation_requests.append(item)

            # The new cluster and the updated clusters should be relocated in the queue.
            self.__insert_cluster(merged_cluster)
            for item in cluster_relocation_requests:
                self.__relocate_cluster(item)

        # Change cluster representation: from internal cluster objects to lists of points.
        self.__clusters = [cure_cluster_unit.points for cure_cluster_unit in self.__queue]


    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.

        @return (list) List of allocated clusters.

        @see process()

        """

        return self.__clusters


    def __insert_cluster(self, cluster):
        """!
        @brief Insert cluster to the list (sorted queue) in line with sequence order (distance).

        @param[in] cluster (cure_cluster): Cluster that should be inserted.

        """

        for index in range(len(self.__queue)):
            if cluster.distance < self.__queue[index].distance:
                self.__queue.insert(index, cluster)
                return

        # Farther than every queued cluster - append to the tail.
        self.__queue.append(cluster)


    def __relocate_cluster(self, cluster):
        """!
        @brief Relocate cluster in list in line with distance order.

        @param[in] cluster (cure_cluster): Cluster that should be relocated in line with order.

        """

        self.__queue.remove(cluster)
        self.__insert_cluster(cluster)


    def __closest_cluster(self, cluster, distance):
        """!
        @brief Find closest cluster to the specified cluster in line with distance.

        @param[in] cluster (cure_cluster): Cluster for which nearest cluster should be found.
        @param[in] distance (double): Closest distance to the previous cluster.

        @return (tuple) Pair (nearest CURE cluster, nearest distance); the cluster element
                 is None when no neighbor is found within the given distance.

        """

        nearest_cluster = None
        nearest_distance = float('inf')

        for point in cluster.rep:
            # Nearest nodes should be returned (at least the point's own node).
            nearest_nodes = self.__tree.find_nearest_dist_nodes(point, distance)
            for (candidate_distance, kdtree_node) in nearest_nodes:
                # Skip the cluster itself - its own representative points live in the tree too.
                if (candidate_distance < nearest_distance) and (kdtree_node is not None) and (kdtree_node.payload is not cluster):
                    nearest_distance = candidate_distance
                    nearest_cluster = kdtree_node.payload

        return (nearest_cluster, nearest_distance)


    def __insert_represented_points(self, cluster):
        """!
        @brief Insert representation points to the k-d tree.

        @param[in] cluster (cure_cluster): Cluster whose representation points should be inserted.

        """

        for point in cluster.rep:
            self.__tree.insert(point, cluster)


    def __delete_represented_points(self, cluster):
        """!
        @brief Remove representation points of clusters from the k-d tree.

        @param[in] cluster (cure_cluster): Cluster whose representation points should be removed.

        """

        for point in cluster.rep:
            self.__tree.remove(point)


    def __merge_clusters(self, cluster1, cluster2):
        """!
        @brief Merges two clusters and returns new merged cluster. Representation points and mean points are calculated for the new cluster.

        @param[in] cluster1 (cure_cluster): Cluster that should be merged.
        @param[in] cluster2 (cure_cluster): Cluster that should be merged.

        @return (cure_cluster) New merged CURE cluster.

        """

        merged_cluster = cure_cluster()
        merged_cluster.points = cluster1.points + cluster2.points

        # Mean of the merged cluster is the size-weighted average of the two means.
        dimension = len(cluster1.mean)
        if merged_cluster.points[1:] == merged_cluster.points[:-1]:
            # All points are identical - the mean is that point itself.
            merged_cluster.mean = merged_cluster.points[0]
        else:
            total_points = len(cluster1.points) + len(cluster2.points)
            merged_cluster.mean = [
                (len(cluster1.points) * cluster1.mean[index] + len(cluster2.points) * cluster2.mean[index]) / total_points
                for index in range(dimension)
            ]

        # Farthest-point selection of representative candidates.
        # TODO: a set should be used in line with the specification (article), but points
        # are lists and lists are not hashable, therefore a list is used instead.
        temporary = []

        for index in range(self.__number_represent_points):
            maximal_distance = 0
            maximal_point = None

            for point in merged_cluster.points:
                # First pick is farthest from the mean; later picks are farthest
                # from the first selected candidate.
                if index == 0:
                    minimal_distance = euclidean_distance(point, merged_cluster.mean)
                else:
                    minimal_distance = euclidean_distance(point, temporary[0])

                if minimal_distance >= maximal_distance:
                    maximal_distance = minimal_distance
                    maximal_point = point

            if maximal_point not in temporary:
                temporary.append(maximal_point)

        # Shrink every selected point toward the mean by the compression factor.
        for point in temporary:
            representative_point = [0] * dimension
            for index in range(dimension):
                representative_point[index] = point[index] + self.__compression * (merged_cluster.mean[index] - point[index])

            merged_cluster.rep.append(representative_point)

        return merged_cluster


    def __create_queue(self):
        """!
        @brief Create queue of sorted clusters by distance between them, where the first cluster has the nearest neighbor. At the first iteration each cluster contains only one point.

        """

        self.__queue = [cure_cluster(point) for point in self.__pointer_data]

        # Set the closest cluster for every cluster (O(n^2) all-pairs scan).
        for i in range(len(self.__queue)):
            minimal_distance = float('inf')
            closest_index_cluster = -1

            for k in range(len(self.__queue)):
                if i != k:
                    dist = self.__cluster_distance(self.__queue[i], self.__queue[k])
                    if dist < minimal_distance:
                        minimal_distance = dist
                        closest_index_cluster = k

            self.__queue[i].closest = self.__queue[closest_index_cluster]
            self.__queue[i].distance = minimal_distance

        # Sort clusters so that the pair with the smallest distance comes first.
        self.__queue.sort(key = lambda x: x.distance, reverse = False)


    def __create_kdtree(self):
        """!
        @brief Create k-d tree in line with created clusters. At the first iteration contains all points from the input data set.

        """

        self.__tree = kdtree()
        for current_cluster in self.__queue:
            for representative_point in current_cluster.rep:
                self.__tree.insert(representative_point, current_cluster)


    def __cluster_distance(self, cluster1, cluster2):
        """!
        @brief Calculate minimal distance between clusters using representative points.

        @param[in] cluster1 (cure_cluster): The first cluster.
        @param[in] cluster2 (cure_cluster): The second cluster.

        @return (double) Euclidean distance between two clusters that is defined by minimum distance between representation points of two clusters.

        """

        distance = float('inf')
        for i in range(len(cluster1.rep)):
            for k in range(len(cluster2.rep)):
                # euclidean_distance_sqrt would be the fast mode; exact distance is used here.
                dist = euclidean_distance(cluster1.rep[i], cluster2.rep[k])
                if dist < distance:
                    distance = dist

        return distance