Code Duplication    Length = 27-34 lines in 4 locations

pyclustering/cluster/kmedians.py 1 location

@@ 129-155 (lines=27) @@
126
        @see get_clusters()
127
        
128
        """
129
130
        return self.__medians;
131
132
133
    def get_cluster_encoding(self):
        """!
        @brief Reports which representation is used to encode the clusters produced by the algorithm.
        
        @return (type_encoding) Clustering result representation.
        
        @see get_clusters()
        
        """
        
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
144
145
146
    def __update_clusters(self):
147
        """!
148
        @brief Calculate Manhattan distance to each point from the each cluster. 
149
        @details Nearest points are captured by according clusters and as a result clusters are updated.
150
        
151
        @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
152
        
153
        """
154
        
155
        clusters = [[] for i in range(len(self.__medians))];
156
        for index_point in range(len(self.__pointer_data)):
157
            index_optim = -1;
158
            dist_optim = 0.0;

pyclustering/cluster/xmeans.py 1 location

@@ 361-394 (lines=34) @@
358
            
359
            Kw = (1.0 - K / N) * sigma_sqrt;
360
            Ks = ( 2.0 * alpha * sigma / (N ** 0.5) ) * ( (alpha ** 2.0) * sigma_sqrt / N + W - Kw / 2.0 ) ** 0.5;
361
            
362
            scores = sigma_sqrt * (2 * K)**0.5 * ((2 * K)**0.5 + betta) / N + W - sigma_sqrt + Ks + 2 * alpha**0.5 * sigma_sqrt / N
363
        
364
        return scores;
365
366
367
    def __bayesian_information_criterion(self, clusters, centers):
368
        """!
369
        @brief Calculates splitting criterion for input clusters using bayesian information criterion.
370
        
371
        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
372
        @param[in] centers (list): Centers of the clusters.
373
        
374
        @return (double) Splitting criterion in line with bayesian information criterion.
375
                High value of splitting criterion means that current structure is much better.
376
                
377
        @see __minimum_noiseless_description_length(clusters, centers)
378
        
379
        """
380
381
        scores = [float('inf')] * len(clusters)     # splitting criterion
382
        dimension = len(self.__pointer_data[0]);
383
          
384
        # estimation of the noise variance in the data set
385
        sigma_sqrt = 0.0;
386
        K = len(clusters);
387
        N = 0.0;
388
          
389
        for index_cluster in range(0, len(clusters), 1):
390
            for index_object in clusters[index_cluster]:
391
                sigma_sqrt += euclidean_distance_sqrt(self.__pointer_data[index_object], centers[index_cluster]);
392
393
            N += len(clusters[index_cluster]);
394
      
395
        if (N - K > 0):
396
            sigma_sqrt /= (N - K);
397
            p = (K - 1) + dimension * K + 1;

pyclustering/cluster/kmeans.py 1 location

@@ 126-152 (lines=27) @@
123
                #changes = max([euclidean_distance(self.__centers[index], updated_centers[index]) for index in range(len(self.__centers))]);        # Slow solution
124
                changes = max([euclidean_distance_sqrt(self.__centers[index], updated_centers[index]) for index in range(len(updated_centers))]);    # Fast solution
125
                 
126
                self.__centers = updated_centers;
127
128
129
    def get_clusters(self):
        """!
        @brief Provides the allocated clusters; every cluster is a list of indexes of objects from the input data.
        
        @see process()
        @see get_centers()
        
        """
        
        return self.__clusters
139
    
140
    
141
    def get_centers(self):
        """!
        @brief Provides the centers of the allocated clusters.
        
        @see process()
        @see get_clusters()
        
        """
        
        return self.__centers
151
152
153
    def get_cluster_encoding(self):
154
        """!
155
        @brief Returns clustering result representation type that indicate how clusters are encoded.

pyclustering/cluster/kmedoids.py 1 location

@@ 146-175 (lines=30) @@
143
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
144
145
146
    def __update_clusters(self):
        """!
        @brief Assigns every non-medoid point to the cluster of its nearest medoid.
        @details Each cluster starts with the index of its own medoid. Every remaining point is appended to the
                  cluster whose medoid yields the smallest value of euclidean_distance_sqrt(); when several
                  medoids are equally near, the one with the lowest index is kept.
        
        @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
        
        """
        
        medoid_count = len(self.__medoids)
        
        # Built once so that the per-point membership test does not rescan the list of medoid indexes.
        medoid_lookup = set(self.__medoid_indexes)
        
        clusters = [[self.__medoid_indexes[i]] for i in range(medoid_count)]
        for index_point in range(len(self.__pointer_data)):
            # Medoids already belong to their own clusters.
            if index_point in medoid_lookup:
                continue
            
            index_optim = -1
            dist_optim = float('Inf')
            
            for index in range(medoid_count):
                dist = euclidean_distance_sqrt(self.__pointer_data[index_point], self.__medoids[index])
                
                # The first medoid is always accepted as the initial candidate. Value equality is required
                # here: 'index is 0' tests object identity, which is not guaranteed for integers.
                if (dist < dist_optim) or (index == 0):
                    index_optim = index
                    dist_optim = dist
            
            clusters[index_optim].append(index_point)
        
        # If cluster is not able to capture object it should be removed
        clusters = [cluster for cluster in clusters if len(cluster) > 0]
        
        return clusters
176
    
177
    
178
    def __update_medoids(self):