| Total Complexity | 24 |
| Total Lines | 152 |
| Duplicated Lines | 38.82 % |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems, and corresponding solutions are:
| 1 | """! |
||
| 34 | class kmedians: |
||
| 35 | """! |
||
| 36 | @brief Class represents clustering algorithm K-Medians. |
||
| 37 | @details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids. |
||
| 38 | |||
| 39 | Example: |
||
| 40 | @code |
||
| 41 | # load list of points for cluster analysis |
||
| 42 | sample = read_sample(path); |
||
| 43 | |||
| 44 | # create instance of K-Medians algorithm |
||
| 45 | kmedians_instance = kmedians(sample, [ [0.0, 0.1], [2.5, 2.6] ]); |
||
| 46 | |||
| 47 | # run cluster analysis and obtain results |
||
| 48 | kmedians_instance.process(); |
||
| 49 | kmedians_instance.get_clusters(); |
||
| 50 | @endcode |
||
| 51 | |||
| 52 | """ |
||
| 53 | |||
| 54 | def __init__(self, data, initial_centers, tolerance = 0.25, ccore = False): |
||
| 55 | """! |
||
| 56 | @brief Constructor of clustering algorithm K-Medians. |
||
| 57 | |||
| 58 | @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple. |
||
| 59 | @param[in] initial_centers (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...]. |
||
| 60 | @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing |
||
| 61 | @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not. |
||
| 62 | |||
| 63 | """ |
||
| 64 | self.__pointer_data = data; |
||
| 65 | self.__clusters = []; |
||
| 66 | self.__medians = initial_centers[:]; |
||
| 67 | self.__tolerance = tolerance; |
||
| 68 | self.__ccore = ccore; |
||
| 69 | |||
| 70 | |||
| 71 | View Code Duplication | def process(self): |
|
|
|
|||
| 72 | """! |
||
| 73 | @brief Performs cluster analysis in line with rules of K-Medians algorithm. |
||
| 74 | |||
| 75 | @remark Results of clustering can be obtained using corresponding get methods. |
||
| 76 | |||
| 77 | @see get_clusters() |
||
| 78 | @see get_medians() |
||
| 79 | |||
| 80 | """ |
||
| 81 | |||
| 82 | if (self.__ccore is True): |
||
| 83 | self.__clusters = wrapper.kmedians(self.__pointer_data, self.__medians, self.__tolerance); |
||
| 84 | self.__medians = self.__update_medians(); |
||
| 85 | |||
| 86 | else: |
||
| 87 | changes = float('inf');
|
||
| 88 | |||
| 89 | stop_condition = self.__tolerance * self.__tolerance; # Fast solution |
||
| 90 | #stop_condition = self.__tolerance; # Slow solution |
||
| 91 | |||
| 92 | # Check for dimension |
||
| 93 | if (len(self.__pointer_data[0]) != len(self.__medians[0])): |
||
| 94 | raise NameError('Dimension of the input data and dimension of the initial cluster medians must be equal.');
|
||
| 95 | |||
| 96 | while (changes > stop_condition): |
||
| 97 | self.__clusters = self.__update_clusters(); |
||
| 98 | updated_centers = self.__update_medians(); # changes should be calculated before asignment |
||
| 99 | |||
| 100 | changes = max([euclidean_distance_sqrt(self.__medians[index], updated_centers[index]) for index in range(len(updated_centers))]); # Fast solution |
||
| 101 | |||
| 102 | self.__medians = updated_centers; |
||
| 103 | |||
| 104 | |||
| 105 | def get_clusters(self): |
||
| 106 | """! |
||
| 107 | @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data. |
||
| 108 | |||
| 109 | @see process() |
||
| 110 | @see get_medians() |
||
| 111 | |||
| 112 | """ |
||
| 113 | |||
| 114 | return self.__clusters; |
||
| 115 | |||
| 116 | |||
| 117 | def get_medians(self): |
||
| 118 | """! |
||
| 119 | @brief Returns list of centers of allocated clusters. |
||
| 120 | |||
| 121 | @see process() |
||
| 122 | @see get_clusters() |
||
| 123 | |||
| 124 | """ |
||
| 125 | |||
| 126 | return self.__medians; |
||
| 127 | |||
| 128 | |||
| 129 | View Code Duplication | def __update_clusters(self): |
|
| 130 | """! |
||
| 131 | @brief Calculate Manhattan distance to each point from the each cluster. |
||
| 132 | @details Nearest points are captured by according clusters and as a result clusters are updated. |
||
| 133 | |||
| 134 | @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data. |
||
| 135 | |||
| 136 | """ |
||
| 137 | |||
| 138 | clusters = [[] for i in range(len(self.__medians))]; |
||
| 139 | for index_point in range(len(self.__pointer_data)): |
||
| 140 | index_optim = -1; |
||
| 141 | dist_optim = 0.0; |
||
| 142 | |||
| 143 | for index in range(len(self.__medians)): |
||
| 144 | dist = euclidean_distance_sqrt(self.__pointer_data[index_point], self.__medians[index]); |
||
| 145 | |||
| 146 | if ( (dist < dist_optim) or (index is 0)): |
||
| 147 | index_optim = index; |
||
| 148 | dist_optim = dist; |
||
| 149 | |||
| 150 | clusters[index_optim].append(index_point); |
||
| 151 | |||
| 152 | # If cluster is not able to capture object it should be removed |
||
| 153 | clusters = [cluster for cluster in clusters if len(cluster) > 0]; |
||
| 154 | |||
| 155 | return clusters; |
||
| 156 | |||
| 157 | |||
| 158 | def __update_medians(self): |
||
| 159 | """! |
||
| 160 | @brief Calculate medians of clusters in line with contained objects. |
||
| 161 | |||
| 162 | @return (list) list of medians for current number of clusters. |
||
| 163 | |||
| 164 | """ |
||
| 165 | |||
| 166 | medians = [[] for i in range(len(self.__clusters))]; |
||
| 167 | |||
| 168 | for index in range(len(self.__clusters)): |
||
| 169 | medians[index] = [ 0.0 for i in range(len(self.__pointer_data[0]))]; |
||
| 170 | length_cluster = len(self.__clusters[index]); |
||
| 171 | |||
| 172 | for index_dimension in range(len(self.__pointer_data[0])): |
||
| 173 | sorted_cluster = sorted(self.__clusters[index], key = lambda x: self.__pointer_data[x][index_dimension]); |
||
| 174 | |||
| 175 | relative_index_median = math.floor(length_cluster / 2); |
||
| 176 | index_median = sorted_cluster[relative_index_median]; |
||
| 177 | |||
| 178 | if ( (length_cluster % 2) and (relative_index_median + 1 < len(sorted_cluster)) ): |
||
| 179 | index_median_second = sorted_cluster[relative_index_median + 1]; |
||
| 180 | medians[index][index_dimension] = (self.__pointer_data[index_median][index_dimension] + self.__pointer_data[index_median_second][index_dimension]) / 2.0; |
||
| 181 | |||
| 182 | else: |
||
| 183 | medians[index][index_dimension] = self.__pointer_data[index_median][index_dimension]; |
||
| 184 | |||
| 185 | return medians; |