kmedians - Code Metrics - Inspection of "Merge branch 'master' of https://github.com/annovi..." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( d2e29b...b6987f )

by Andrei

created 2017-02-21 11:58 UTC

kmedians A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	165
Duplicated Lines	35.76 %

Importance

Changes

Metric	Value
c	0
b	0
f	0
dl	59
loc	165
rs	10
wmc	24

7 Methods

Rating	Name	Duplication	Size	Complexity
A	get_cluster_encoding()	11	11	1
A	__init__()	0	15	1
D	__update_clusters()	10	27	8
A	get_clusters()	0	10	1
A	get_medians()	2	10	1
B	process()	32	32	5
C	__update_medians()	0	28	7

How to fix Duplicated Code

"""!

@brief Cluster analysis algorithm: K-Medians
@details Based on book description:
         - A.K. Jain, R.C Dubes, Algorithms for Clustering Data. 1988.

@authors Andrei Novikov ([email protected])
@date 2014-2017
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import math;

from pyclustering.cluster.encoder import type_encoding;

from pyclustering.utils import euclidean_distance_sqrt;

import pyclustering.core.kmedians_wrapper as wrapper;


class kmedians:
    """!
    @brief Class represents clustering algorithm K-Medians.
    @details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids.
    
    Example:
    @code
        # load list of points for cluster analysis
        sample = read_sample(path);
        
        # create instance of K-Medians algorithm
        kmedians_instance = kmedians(sample, [ [0.0, 0.1], [2.5, 2.6] ]);
        
        # run cluster analysis and obtain results
        kmedians_instance.process();
        kmedians_instance.get_clusters();    
    @endcode
    
    """
    
    def __init__(self, data, initial_centers, tolerance = 0.25, ccore = False):
        """!
        @brief Constructor of clustering algorithm K-Medians.
        
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
        @param[in] initial_centers (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...].
        @param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing
        @param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
        
        """
        self.__pointer_data = data;
        self.__clusters = [];
        self.__medians = initial_centers[:];
        self.__tolerance = tolerance;
        self.__ccore = ccore;


    def process(self):

        """!
        @brief Performs cluster analysis in line with rules of K-Medians algorithm.
        
        @remark Results of clustering can be obtained using corresponding get methods.
        
        @see get_clusters()
        @see get_medians()
        
        """
        
        if (self.__ccore is True):
            self.__clusters = wrapper.kmedians(self.__pointer_data, self.__medians, self.__tolerance);
            self.__medians = self.__update_medians();
            
        else:
            changes = float('inf');
             
            stop_condition = self.__tolerance * self.__tolerance;   # Fast solution
            #stop_condition = self.__tolerance;              # Slow solution
             
            # Check for dimension
            if (len(self.__pointer_data[0]) != len(self.__medians[0])):
                raise NameError('Dimension of the input data and dimension of the initial cluster medians must be equal.');
             
            while (changes > stop_condition):
                self.__clusters = self.__update_clusters();
                updated_centers = self.__update_medians();  # changes should be calculated before asignment
             
                changes = max([euclidean_distance_sqrt(self.__medians[index], updated_centers[index]) for index in range(len(updated_centers))]);    # Fast solution
                 
                self.__medians = updated_centers;


    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
        
        @see process()
        @see get_medians()
        
        """
        
        return self.__clusters;
    
    
    def get_medians(self):
        """!
        @brief Returns list of centers of allocated clusters.
        
        @see process()
        @see get_clusters()
        
        """


        return self.__medians;


    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicate how clusters are encoded.
        
        @return (type_encoding) Clustering result representation.
        
        @see get_clusters()
        
        """
        
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;


    def __update_clusters(self):
        """!
        @brief Calculate Manhattan distance to each point from the each cluster. 
        @details Nearest points are captured by according clusters and as a result clusters are updated.
        
        @return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
        
        """
        
        clusters = [[] for i in range(len(self.__medians))];
        for index_point in range(len(self.__pointer_data)):
            index_optim = -1;
            dist_optim = 0.0;
             
            for index in range(len(self.__medians)):
                dist = euclidean_distance_sqrt(self.__pointer_data[index_point], self.__medians[index]);
                 
                if ( (dist < dist_optim) or (index is 0)):
                    index_optim = index;
                    dist_optim = dist;
             
            clusters[index_optim].append(index_point);
            
        # If cluster is not able to capture object it should be removed
        clusters = [cluster for cluster in clusters if len(cluster) > 0];
        
        return clusters;
    
    
    def __update_medians(self):
        """!
        @brief Calculate medians of clusters in line with contained objects.
        
        @return (list) list of medians for current number of clusters.
        
        """
         
        medians = [[] for i in range(len(self.__clusters))];
         
        for index in range(len(self.__clusters)):
            medians[index] = [ 0.0 for i in range(len(self.__pointer_data[0]))];
            length_cluster = len(self.__clusters[index]);
            
            for index_dimension in range(len(self.__pointer_data[0])):
                sorted_cluster = sorted(self.__clusters[index], key = lambda x: self.__pointer_data[x][index_dimension]);
                
                relative_index_median = math.floor(length_cluster / 2);
                index_median = sorted_cluster[relative_index_median];
                
                if ( (length_cluster % 2) == 0 ):
                    index_median_second = sorted_cluster[relative_index_median + 1];
                    medians[index][index_dimension] =  (self.__pointer_data[index_median][index_dimension] + self.__pointer_data[index_median_second][index_dimension]) / 2.0;
                    
                else:
                    medians[index][index_dimension] = self.__pointer_data[index_median][index_dimension];
             
        return medians;

1		"""!
2
3		@brief Cluster analysis algorithm: K-Medians
4		@details Based on book description:
5		- A.K. Jain, R.C Dubes, Algorithms for Clustering Data. 1988.
6
7		@authors Andrei Novikov ([email protected])
8		@date 2014-2017
9		@copyright GNU Public License
10
11		@cond GNU_PUBLIC_LICENSE
12		PyClustering is free software: you can redistribute it and/or modify
13		it under the terms of the GNU General Public License as published by
14		the Free Software Foundation, either version 3 of the License, or
15		(at your option) any later version.
16
17		PyClustering is distributed in the hope that it will be useful,
18		but WITHOUT ANY WARRANTY; without even the implied warranty of
19		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20		GNU General Public License for more details.
21
22		You should have received a copy of the GNU General Public License
23		along with this program. If not, see <http://www.gnu.org/licenses/>.
24		@endcond
25
26		"""
27
28
29		import math;
30
31		from pyclustering.cluster.encoder import type_encoding;
32
33		from pyclustering.utils import euclidean_distance_sqrt;
34
35		import pyclustering.core.kmedians_wrapper as wrapper;
36
37
38		class kmedians:
39		"""!
40		@brief Class represents clustering algorithm K-Medians.
41		@details The algorithm is less sensitive to outliers than K-Means. Medians are calculated instead of centroids.
42
43		Example:
44		@code
45		# load list of points for cluster analysis
46		sample = read_sample(path);
47
48		# create instance of K-Medians algorithm
49		kmedians_instance = kmedians(sample, [ [0.0, 0.1], [2.5, 2.6] ]);
50
51		# run cluster analysis and obtain results
52		kmedians_instance.process();
53		kmedians_instance.get_clusters();
54		@endcode
55
56		"""
57
58		def __init__(self, data, initial_centers, tolerance = 0.25, ccore = False):
59		"""!
60		@brief Constructor of clustering algorithm K-Medians.
61
62		@param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
63		@param[in] initial_centers (list): Initial coordinates of medians of clusters that are represented by list: [center1, center2, ...].
64		@param[in] tolerance (double): Stop condition: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing
65		@param[in] ccore (bool): Defines should be CCORE library (C++ pyclustering library) used instead of Python code or not.
66
67		"""
68		self.__pointer_data = data;
69		self.__clusters = [];
70		self.__medians = initial_centers[:];
71		self.__tolerance = tolerance;
72		self.__ccore = ccore;
73
74
75	View Code Duplication	def process(self):
		0 ignored issues – show Duplication introduced 2016-04-28 20:11 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
76		"""!
77		@brief Performs cluster analysis in line with rules of K-Medians algorithm.
78
79		@remark Results of clustering can be obtained using corresponding get methods.
80
81		@see get_clusters()
82		@see get_medians()
83
84		"""
85
86		if (self.__ccore is True):
87		self.__clusters = wrapper.kmedians(self.__pointer_data, self.__medians, self.__tolerance);
88		self.__medians = self.__update_medians();
89
90		else:
91		changes = float('inf');
92
93		stop_condition = self.__tolerance * self.__tolerance; # Fast solution
94		#stop_condition = self.__tolerance; # Slow solution
95
96		# Check for dimension
97		if (len(self.__pointer_data[0]) != len(self.__medians[0])):
98		raise NameError('Dimension of the input data and dimension of the initial cluster medians must be equal.');
99
100		while (changes > stop_condition):
101		self.__clusters = self.__update_clusters();
102		updated_centers = self.__update_medians(); # changes should be calculated before asignment
103
104		changes = max([euclidean_distance_sqrt(self.__medians[index], updated_centers[index]) for index in range(len(updated_centers))]); # Fast solution
105
106		self.__medians = updated_centers;
107
108
109		def get_clusters(self):
110		"""!
111		@brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
112
113		@see process()
114		@see get_medians()
115
116		"""
117
118		return self.__clusters;
119
120
121		def get_medians(self):
122		"""!
123		@brief Returns list of centers of allocated clusters.
124
125		@see process()
126		@see get_clusters()
127
128		"""
129	View Code Duplication
		0 ignored issues – show Duplication introduced 2016-04-28 20:11 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
130		return self.__medians;
131
132
133		def get_cluster_encoding(self):
134		"""!
135		@brief Returns clustering result representation type that indicate how clusters are encoded.
136
137		@return (type_encoding) Clustering result representation.
138
139		@see get_clusters()
140
141		"""
142
143		return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
144
145
146		def __update_clusters(self):
147		"""!
148		@brief Calculate Manhattan distance to each point from the each cluster.
149		@details Nearest points are captured by according clusters and as a result clusters are updated.
150
151		@return (list) updated clusters as list of clusters where each cluster contains indexes of objects from data.
152
153		"""
154
155		clusters = [[] for i in range(len(self.__medians))];
156		for index_point in range(len(self.__pointer_data)):
157		index_optim = -1;
158		dist_optim = 0.0;
159
160		for index in range(len(self.__medians)):
161		dist = euclidean_distance_sqrt(self.__pointer_data[index_point], self.__medians[index]);
162
163		if ( (dist < dist_optim) or (index is 0)):
164		index_optim = index;
165		dist_optim = dist;
166
167		clusters[index_optim].append(index_point);
168
169		# If cluster is not able to capture object it should be removed
170		clusters = [cluster for cluster in clusters if len(cluster) > 0];
171
172		return clusters;
173
174
175		def __update_medians(self):
176		"""!
177		@brief Calculate medians of clusters in line with contained objects.
178
179		@return (list) list of medians for current number of clusters.
180
181		"""
182
183		medians = [[] for i in range(len(self.__clusters))];
184
185		for index in range(len(self.__clusters)):
186		medians[index] = [ 0.0 for i in range(len(self.__pointer_data[0]))];
187		length_cluster = len(self.__clusters[index]);
188
189		for index_dimension in range(len(self.__pointer_data[0])):
190		sorted_cluster = sorted(self.__clusters[index], key = lambda x: self.__pointer_data[x][index_dimension]);
191
192		relative_index_median = math.floor(length_cluster / 2);
193		index_median = sorted_cluster[relative_index_median];
194
195		if ( (length_cluster % 2) == 0 ):
196		index_median_second = sorted_cluster[relative_index_median + 1];
197		medians[index][index_dimension] = (self.__pointer_data[index_median][index_dimension] + self.__pointer_data[index_median_second][index_dimension]) / 2.0;
198
199		else:
200		medians[index][index_dimension] = self.__pointer_data[index_median][index_dimension];
201
202		return medians;

annoviko / pyclustering

Push — master ( d2e29b...b6987f )

kmedians A

Complexity

Size/Duplication

Importance

7 Methods

How to fix Duplicated Code

Duplicated Code

Duplication Side-by-Side

Filter issues like