kmedians - Code Metrics - Inspection of "[pyclustering.cluster.kmedians] Bug correction (is..." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 916c89...8edca8 )

by Andrei

created 2016-04-21 20:14 UTC

kmedians A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	152
Duplicated Lines	38.82 %

Metric	Value
dl	59
loc	152
rs	10
wmc	24

6 Methods

Rating	Name	Duplication	Size	Complexity
A	__init__()	0	15	1
B	process()	32	32	5
D	__update_clusters()	27	27	8
A	get_clusters()	0	10	1
A	get_medians()	0	10	1
D	__update_medians()	0	28	8

How to fix Duplicated Code

"""!

@brief Cluster analysis algorithm: K-Medians
@details Based on book description:
         - A.K. Jain, R.C Dubes, Algorithms for Clustering Data. 1988.

@authors Andrei Novikov ([email protected])
@date 2014-2016
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""

import math;

from pyclustering.utils import euclidean_distance_sqrt;

import pyclustering.core.wrapper as wrapper;

class kmedians:
    """!
    @brief Class represents clustering algorithm K-Medians.
    @details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids.
    
    Example:
    @code
        # load list of points for cluster analysis
        sample = read_sample(path);
        
        # create instance of K-Medians algorithm
        kmedians_instance = kmedians(sample, [ [0.0, 0.1], [2.5, 2.6] ]);
        
        # run cluster analysis and obtain results
        kmedians_instance.process();
        kmedians_instance.get_clusters();    
    @endcode
    
    """
    
    def __init__(self, data, initial_centers, tolerance = 0.25, ccore = False):
        """!
        @brief Constructor of clustering algorithm K-Medians.
        
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
        @param[in] initial_centers (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...].
        @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing
        @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
        
        """
        self.__pointer_data = data;
        self.__clusters = [];
        self.__medians = initial_centers[:];
        self.__tolerance = tolerance;
        self.__ccore = ccore;


    def process(self):

        """!
        @brief Performs cluster analysis in line with rules of K-Medians algorithm.
        
        @remark Results of clustering can be obtained using corresponding get methods.
        
        @see get_clusters()
        @see get_medians()
        
        """
        
        if (self.__ccore is True):
            self.__clusters = wrapper.kmedians(self.__pointer_data, self.__medians, self.__tolerance);
            self.__medians = self.__update_medians();
            
        else:
            changes = float('inf');
             
            stop_condition = self.__tolerance * self.__tolerance;   # Fast solution
            #stop_condition = self.__tolerance;              # Slow solution
             
            # Check for dimension
            if (len(self.__pointer_data[0]) != len(self.__medians[0])):
                raise NameError('Dimension of the input data and dimension of the initial cluster medians must be equal.');
             
            while (changes > stop_condition):
                self.__clusters = self.__update_clusters();
                updated_centers = self.__update_medians();  # changes should be calculated before asignment
             
                changes = max([euclidean_distance_sqrt(self.__medians[index], updated_centers[index]) for index in range(len(updated_centers))]);    # Fast solution
                 
                self.__medians = updated_centers;


    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
        
        @see process()
        @see get_medians()
        
        """
        
        return self.__clusters;
    
    
    def get_medians(self):
        """!
        @brief Returns list of centers of allocated clusters.
        
        @see process()
        @see get_clusters()
        
        """

        return self.__medians;


    def __update_clusters(self):

        """!
        @brief Calculate Manhattan distance to each point from the each cluster. 
        @details Nearest points are captured by according clusters and as a result clusters are updated.
        
        @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
        
        """
        
        clusters = [[] for i in range(len(self.__medians))];
        for index_point in range(len(self.__pointer_data)):
            index_optim = -1;
            dist_optim = 0.0;
             
            for index in range(len(self.__medians)):
                dist = euclidean_distance_sqrt(self.__pointer_data[index_point], self.__medians[index]);
                 
                if ( (dist < dist_optim) or (index is 0)):
                    index_optim = index;
                    dist_optim = dist;
             
            clusters[index_optim].append(index_point);
            
        # If cluster is not able to capture object it should be removed
        clusters = [cluster for cluster in clusters if len(cluster) > 0];
        
        return clusters;
    
    
    def __update_medians(self):
        """!
        @brief Calculate medians of clusters in line with contained objects.
        
        @return (list) list of medians for current number of clusters.
        
        """
         
        medians = [[] for i in range(len(self.__clusters))];
         
        for index in range(len(self.__clusters)):
            medians[index] = [ 0.0 for i in range(len(self.__pointer_data[0]))];
            length_cluster = len(self.__clusters[index]);
            
            for index_dimension in range(len(self.__pointer_data[0])):
                sorted_cluster = sorted(self.__clusters[index], key = lambda x: self.__pointer_data[x][index_dimension]);
                
                relative_index_median = math.floor(length_cluster / 2);
                index_median = sorted_cluster[relative_index_median];
                
                if ( (length_cluster % 2) and (relative_index_median + 1 < len(sorted_cluster)) ):
                    index_median_second = sorted_cluster[relative_index_median + 1];
                    medians[index][index_dimension] =  (self.__pointer_data[index_median][index_dimension] + self.__pointer_data[index_median_second][index_dimension]) / 2.0;
                    
                else:
                    medians[index][index_dimension] = self.__pointer_data[index_median][index_dimension];
             
        return medians;

1		"""!
2
3		@brief Cluster analysis algorithm: K-Medians
4		@details Based on book description:
5		- A.K. Jain, R.C Dubes, Algorithms for Clustering Data. 1988.
6
7		@authors Andrei Novikov ([email protected])
8		@date 2014-2016
9		@copyright GNU Public License
10
11		@cond GNU_PUBLIC_LICENSE
12		PyClustering is free software: you can redistribute it and/or modify
13		it under the terms of the GNU General Public License as published by
14		the Free Software Foundation, either version 3 of the License, or
15		(at your option) any later version.
16
17		PyClustering is distributed in the hope that it will be useful,
18		but WITHOUT ANY WARRANTY; without even the implied warranty of
19		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20		GNU General Public License for more details.
21
22		You should have received a copy of the GNU General Public License
23		along with this program. If not, see <http://www.gnu.org/licenses/>.
24		@endcond
25
26		"""
27
28		import math;
29
30		from pyclustering.utils import euclidean_distance_sqrt;
31
32		import pyclustering.core.wrapper as wrapper;
33
34		class kmedians:
35		"""!
36		@brief Class represents clustering algorithm K-Medians.
37		@details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids.
38
39		Example:
40		@code
41		# load list of points for cluster analysis
42		sample = read_sample(path);
43
44		# create instance of K-Medians algorithm
45		kmedians_instance = kmedians(sample, [ [0.0, 0.1], [2.5, 2.6] ]);
46
47		# run cluster analysis and obtain results
48		kmedians_instance.process();
49		kmedians_instance.get_clusters();
50		@endcode
51
52		"""
53
54		def __init__(self, data, initial_centers, tolerance = 0.25, ccore = False):
55		"""!
56		@brief Constructor of clustering algorithm K-Medians.
57
58		@param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
59		@param[in] initial_centers (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...].
60		@param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing
61		@param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
62
63		"""
64		self.__pointer_data = data;
65		self.__clusters = [];
66		self.__medians = initial_centers[:];
67		self.__tolerance = tolerance;
68		self.__ccore = ccore;
69
70
71	View Code Duplication	def process(self):
		0 ignored issues – show Duplication introduced 2016-04-12 15:27 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
72		"""!
73		@brief Performs cluster analysis in line with rules of K-Medians algorithm.
74
75		@remark Results of clustering can be obtained using corresponding get methods.
76
77		@see get_clusters()
78		@see get_medians()
79
80		"""
81
82		if (self.__ccore is True):
83		self.__clusters = wrapper.kmedians(self.__pointer_data, self.__medians, self.__tolerance);
84		self.__medians = self.__update_medians();
85
86		else:
87		changes = float('inf');
88
89		stop_condition = self.__tolerance * self.__tolerance; # Fast solution
90		#stop_condition = self.__tolerance; # Slow solution
91
92		# Check for dimension
93		if (len(self.__pointer_data[0]) != len(self.__medians[0])):
94		raise NameError('Dimension of the input data and dimension of the initial cluster medians must be equal.');
95
96		while (changes > stop_condition):
97		self.__clusters = self.__update_clusters();
98		updated_centers = self.__update_medians(); # changes should be calculated before asignment
99
100		changes = max([euclidean_distance_sqrt(self.__medians[index], updated_centers[index]) for index in range(len(updated_centers))]); # Fast solution
101
102		self.__medians = updated_centers;
103
104
105		def get_clusters(self):
106		"""!
107		@brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
108
109		@see process()
110		@see get_medians()
111
112		"""
113
114		return self.__clusters;
115
116
117		def get_medians(self):
118		"""!
119		@brief Returns list of centers of allocated clusters.
120
121		@see process()
122		@see get_clusters()
123
124		"""
125
126		return self.__medians;
127
128
129	View Code Duplication	def __update_clusters(self):
		0 ignored issues – show Duplication introduced 2016-04-12 15:27 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
130		"""!
131		@brief Calculate Manhattan distance to each point from the each cluster.
132		@details Nearest points are captured by according clusters and as a result clusters are updated.
133
134		@return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
135
136		"""
137
138		clusters = [[] for i in range(len(self.__medians))];
139		for index_point in range(len(self.__pointer_data)):
140		index_optim = -1;
141		dist_optim = 0.0;
142
143		for index in range(len(self.__medians)):
144		dist = euclidean_distance_sqrt(self.__pointer_data[index_point], self.__medians[index]);
145
146		if ( (dist < dist_optim) or (index is 0)):
147		index_optim = index;
148		dist_optim = dist;
149
150		clusters[index_optim].append(index_point);
151
152		# If cluster is not able to capture object it should be removed
153		clusters = [cluster for cluster in clusters if len(cluster) > 0];
154
155		return clusters;
156
157
158		def __update_medians(self):
159		"""!
160		@brief Calculate medians of clusters in line with contained objects.
161
162		@return (list) list of medians for current number of clusters.
163
164		"""
165
166		medians = [[] for i in range(len(self.__clusters))];
167
168		for index in range(len(self.__clusters)):
169		medians[index] = [ 0.0 for i in range(len(self.__pointer_data[0]))];
170		length_cluster = len(self.__clusters[index]);
171
172		for index_dimension in range(len(self.__pointer_data[0])):
173		sorted_cluster = sorted(self.__clusters[index], key = lambda x: self.__pointer_data[x][index_dimension]);
174
175		relative_index_median = math.floor(length_cluster / 2);
176		index_median = sorted_cluster[relative_index_median];
177
178		if ( (length_cluster % 2) and (relative_index_median + 1 < len(sorted_cluster)) ):
179		index_median_second = sorted_cluster[relative_index_median + 1];
180		medians[index][index_dimension] = (self.__pointer_data[index_median][index_dimension] + self.__pointer_data[index_median_second][index_dimension]) / 2.0;
181
182		else:
183		medians[index][index_dimension] = self.__pointer_data[index_median][index_dimension];
184
185		return medians;

annoviko / pyclustering

Push — master ( 916c89...8edca8 )

kmedians A

Complexity

Size/Duplication

6 Methods

How to fix Duplicated Code

Duplicated Code

Duplication Side-by-Side

Filter issues like