| @@ 129-155 (lines=27) @@ | ||
| 126 | @see get_clusters() |
|
| 127 | ||
| 128 | """ |
|
| 129 | ||
| 130 | return self.__medians; |
|
| 131 | ||
| 132 | ||
def get_cluster_encoding(self):
    """!
    @brief Returns the clustering result representation type that indicates how clusters are encoded.

    @return (type_encoding) Clustering result representation; this implementation always
             reports type_encoding.CLUSTER_INDEX_LIST_SEPARATION.

    @see get_clusters()

    """

    # The encoding is a fixed constant: it does not depend on the instance state.
    return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
|
| 144 | ||
| 145 | ||
| 146 | def __update_clusters(self): |
|
| 147 | """! |
|
| 148 | @brief Calculate Manhattan distance to each point from the each cluster. |
|
| 149 | @details Nearest points are captured by according clusters and as a result clusters are updated. |
|
| 150 | ||
| 151 | @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data. |
|
| 152 | ||
| 153 | """ |
|
| 154 | ||
| 155 | clusters = [[] for i in range(len(self.__medians))]; |
|
| 156 | for index_point in range(len(self.__pointer_data)): |
|
| 157 | index_optim = -1; |
|
| 158 | dist_optim = 0.0; |
|
| @@ 361-394 (lines=34) @@ | ||
| 358 | ||
| 359 | Kw = (1.0 - K / N) * sigma_sqrt; |
|
| 360 | Ks = ( 2.0 * alpha * sigma / (N ** 0.5) ) * ( (alpha ** 2.0) * sigma_sqrt / N + W - Kw / 2.0 ) ** 0.5; |
|
| 361 | ||
| 362 | scores = sigma_sqrt * (2 * K)**0.5 * ((2 * K)**0.5 + betta) / N + W - sigma_sqrt + Ks + 2 * alpha**0.5 * sigma_sqrt / N |
|
| 363 | ||
| 364 | return scores; |
|
| 365 | ||
| 366 | ||
| 367 | def __bayesian_information_criterion(self, clusters, centers): |
|
| 368 | """! |
|
| 369 | @brief Calculates splitting criterion for input clusters using bayesian information criterion. |
|
| 370 | ||
| 371 | @param[in] clusters (list): Clusters for which splitting criterion should be calculated. |
|
| 372 | @param[in] centers (list): Centers of the clusters. |
|
| 373 | ||
| 374 | @return (double) Splitting criterion in line with bayesian information criterion. |
|
| 375 | High value of splitting criterion means that current structure is much better. |
|
| 376 | ||
| 377 | @see __minimum_noiseless_description_length(clusters, centers) |
|
| 378 | ||
| 379 | """ |
|
| 380 | ||
| 381 | scores = [float('inf')] * len(clusters) # splitting criterion
|
|
| 382 | dimension = len(self.__pointer_data[0]); |
|
| 383 | ||
| 384 | # estimation of the noise variance in the data set |
|
| 385 | sigma_sqrt = 0.0; |
|
| 386 | K = len(clusters); |
|
| 387 | N = 0.0; |
|
| 388 | ||
| 389 | for index_cluster in range(0, len(clusters), 1): |
|
| 390 | for index_object in clusters[index_cluster]: |
|
| 391 | sigma_sqrt += euclidean_distance_sqrt(self.__pointer_data[index_object], centers[index_cluster]); |
|
| 392 | ||
| 393 | N += len(clusters[index_cluster]); |
|
| 394 | ||
| 395 | if (N - K > 0): |
|
| 396 | sigma_sqrt /= (N - K); |
|
| 397 | p = (K - 1) + dimension * K + 1; |
|
| @@ 126-152 (lines=27) @@ | ||
| 123 | #changes = max([euclidean_distance(self.__centers[index], updated_centers[index]) for index in range(len(self.__centers))]); # Slow solution |
|
| 124 | changes = max([euclidean_distance_sqrt(self.__centers[index], updated_centers[index]) for index in range(len(updated_centers))]); # Fast solution |
|
| 125 | ||
| 126 | self.__centers = updated_centers; |
|
| 127 | ||
| 128 | ||
def get_clusters(self):
    """!
    @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.

    @return (list) The stored list of clusters. The internal list object itself is returned
             (no copy is made), so changes made by the caller are visible to this instance.

    @see process()
    @see get_centers()

    """

    return self.__clusters
|
| 139 | ||
| 140 | ||
def get_centers(self):
    """!
    @brief Returns list of centers of allocated clusters.

    @return (list) The stored list of cluster centers. The internal list object itself is
             returned (no copy is made), so changes made by the caller are visible to this instance.

    @see process()
    @see get_clusters()

    """

    return self.__centers
|
| 151 | ||
| 152 | ||
| 153 | def get_cluster_encoding(self): |
|
| 154 | """! |
|
| 155 | @brief Returns clustering result representation type that indicate how clusters are encoded. |
|
| @@ 146-172 (lines=27) @@ | ||
| 143 | @return (type_encoding) Clustering result representation. |
|
| 144 | ||
| 145 | @see get_clusters() |
|
| 146 | ||
| 147 | """ |
|
| 148 | ||
| 149 | return type_encoding.CLUSTER_INDEX_LIST_SEPARATION; |
|
| 150 | ||
| 151 | ||
| 152 | def __update_clusters(self): |
|
| 153 | """! |
|
| 154 | @brief Calculate distance to each point from the each cluster. |
|
| 155 | @details Nearest points are captured by according clusters and as a result clusters are updated. |
|
| 156 | ||
| 157 | @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data. |
|
| 158 | ||
| 159 | """ |
|
| 160 | ||
| 161 | clusters = [[self.__medoid_indexes[i]] for i in range(len(self.__medoids))]; |
|
| 162 | for index_point in range(len(self.__pointer_data)): |
|
| 163 | if (index_point in self.__medoid_indexes): |
|
| 164 | continue; |
|
| 165 | ||
| 166 | index_optim = -1; |
|
| 167 | dist_optim = float('Inf');
|
|
| 168 | ||
| 169 | for index in range(len(self.__medoids)): |
|
| 170 | dist = euclidean_distance_sqrt(self.__pointer_data[index_point], self.__medoids[index]); |
|
| 171 | ||
| 172 | if ( (dist < dist_optim) or (index is 0)): |
|
| 173 | index_optim = index; |
|
| 174 | dist_optim = dist; |
|
| 175 | ||