agglomerative - Code Metrics - Inspection of "[pyclustering] Refactoring." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 1d669d...037fe6 )

by Andrei

created 2016-05-23 17:00 UTC

agglomerative B

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	256
Duplicated Lines	14.84 %

Metric	Value
dl	38
loc	256
rs	8.3157
wmc	43

11 Methods

Rating	Name	Duplication	Size	Complexity
B	__merge_by_average_link()	0	25	6
A	__merge_by_centroid_link()	0	21	4
A	get_clusters()	0	13	1
A	__init__()	0	23	3
B	__merge_similar_clusters()	0	20	5
A	process()	0	19	4
A	__calculate_nearest_distance()	0	19	4
A	__merge_by_signle_link()	19	19	4
A	__calculate_farthest_distance()	0	19	4
A	__merge_by_complete_link()	19	19	4
A	__calculate_center()	0	18	4

How to fix Duplicated Code Complexity

"""!

@brief Cluster analysis algorithm: agglomerative algorithm.
@details Implementation based on book:
         - K.Anil, J.C.Dubes, R.C.Dubes. Algorithms for Clustering Data. 1988.

@authors Andrei Novikov ([email protected])
@date 2014-2016
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""

from enum import IntEnum;

from pyclustering.utils import euclidean_distance_sqrt;

import pyclustering.core.agglomerative_wrapper as wrapper;


class type_link(IntEnum):
    """!
    @brief Enumerator of types of link between clusters.
    
    """

    ## Nearest objects in clusters is considered as a link.
    SINGLE_LINK = 0;
    
    ## Farthest objects in clusters is considered as a link.
    COMPLETE_LINK = 1;
    
    ## Average distance between objects in clusters is considered as a link.
    AVERAGE_LINK = 2;
    
    ## Distance between centers of clusters is considered as a link.
    CENTROID_LINK = 3;


class agglomerative:
    """!
    @brief Class represents agglomerative algorithm for cluster analysis.
    
    Example:
    @code
        # sample for cluster analysis (represented by list)
        sample = read_sample(path_to_sample);
        
        # create object that uses python code only
        agglomerative_instance = agglomerative(sample, 2, link_type.CENTROID_LINK)
        
        # cluster analysis
        agglomerative_instance.process();
        
        # obtain results of clustering
        clusters = agglomerative_instance.get_clusters();  
    @endcode
    
    """
    
    def __init__(self, data, number_clusters, link = None, ccore = False):
        """!
        @brief Constructor of agglomerative hierarchical algorithm.
        
        @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by a list or tuple.
        @param[in] number_clusters (uint): Number of clusters that should be allocated.
        @param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default.
        @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
        
        """  
        
        self.__pointer_data = data;
        self.__number_clusters = number_clusters;
        self.__similarity = link;
        
        if (self.__similarity is None):
            self.__similarity = type_link.CENTROID_LINK;
        
        self.__clusters = [];
        self.__ccore = ccore;
        
        if (self.__similarity == type_link.CENTROID_LINK):
            self.__centers = self.__pointer_data.copy();    # used in case of usage of centroid links
    
    
    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
        
        @see get_clusters()
        
        """
        
        if (self.__ccore is True):
            self.__clusters = wrapper.agglomerative_algorithm(self.__pointer_data, self.__number_clusters, self.__similarity);

        else:
            self.__clusters = [[index] for index in range(0, len(self.__pointer_data))];
            
            current_number_clusters = len(self.__clusters);
                
            while (current_number_clusters > self.__number_clusters):
                self.__merge_similar_clusters();
                current_number_clusters = len(self.__clusters);
    
    
    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
        
        @remark Results of clustering can be obtained using corresponding gets methods.
        
        @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
        
        @see process()
        
        """
        
        return self.__clusters;
    
    
    def __merge_similar_clusters(self):
        """!
        @brief Merges the most similar clusters in line with link type.
        
        """
        
        if (self.__similarity == type_link.AVERAGE_LINK):
            self.__merge_by_average_link();
        
        elif (self.__similarity == type_link.CENTROID_LINK):
            self.__merge_by_centroid_link();
        
        elif (self.__similarity == type_link.COMPLETE_LINK):
            self.__merge_by_complete_link();
        
        elif (self.__similarity == type_link.SINGLE_LINK):
            self.__merge_by_signle_link();
        
        else:
            raise NameError('Not supported similarity is used');
    
    
    def __merge_by_average_link(self):
        """!
        @brief Merges the most similar clusters in line with average link type.
        
        """
        
        minimum_average_distance = float('Inf');
        
        for index_cluster1 in range(0, len(self.__clusters)):
            for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
                
                # Find farthest objects
                candidate_average_distance = 0.0;
                for index_object1 in self.__clusters[index_cluster1]:
                    for index_object2 in self.__clusters[index_cluster2]:
                        candidate_average_distance += euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
                
                candidate_average_distance /= (len(self.__clusters[index_cluster1]) + len(self.__clusters[index_cluster2]));
                
                if (candidate_average_distance < minimum_average_distance):
                    minimum_average_distance = candidate_average_distance;
                    indexes = [index_cluster1, index_cluster2];
        
        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];  
        self.__clusters.pop(indexes[1]);   # remove merged cluster.  
        
    
    def __merge_by_centroid_link(self):
        """!
        @brief Merges the most similar clusters in line with centroid link type.
        
        """
        
        minimum_centroid_distance = float('Inf');
        indexes = None;
        
        for index1 in range(0, len(self.__centers)):
            for index2 in range(index1 + 1, len(self.__centers)):
                distance = euclidean_distance_sqrt(self.__centers[index1], self.__centers[index2]);
                if (distance < minimum_centroid_distance):
                    minimum_centroid_distance = distance;
                    indexes = [index1, index2];
        
        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
        self.__centers[indexes[0]] = self.__calculate_center(self.__clusters[indexes[0]]);
         
        self.__clusters.pop(indexes[1]);   # remove merged cluster.
        self.__centers.pop(indexes[1]);    # remove merged center.
    
    
    def __merge_by_complete_link(self):

        """!
        @brief Merges the most similar clusters in line with complete link type.
        
        """
        
        minimum_complete_distance = float('Inf');
        indexes = None;
        
        for index_cluster1 in range(0, len(self.__clusters)):
            for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
                candidate_maximum_distance = self.__calculate_farthest_distance(index_cluster1, index_cluster2);
                
                if (candidate_maximum_distance < minimum_complete_distance):
                    minimum_complete_distance = candidate_maximum_distance;
                    indexes = [index_cluster1, index_cluster2];

        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];  
        self.__clusters.pop(indexes[1]);   # remove merged cluster.        
    
    
    def __calculate_farthest_distance(self, index_cluster1, index_cluster2):
        """!
        @brief Finds two farthest objects in two specified clusters in terms and returns distance between them.
        
        @param[in] (uint) Index of the first cluster.
        @param[in] (uint) Index of the second cluster.
        
        @return The farthest euclidean distance between two clusters.
        
        """
        candidate_maximum_distance = 0.0;
        for index_object1 in self.__clusters[index_cluster1]:
            for index_object2 in self.__clusters[index_cluster2]:
                distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
                
                if (distance > candidate_maximum_distance):
                    candidate_maximum_distance = distance;
    
        return candidate_maximum_distance;
    
    
    def __merge_by_signle_link(self):

        """!
        @brief Merges the most similar clusters in line with single link type.
        
        """
        
        minimum_single_distance = float('Inf');
        indexes = None;
        
        for index_cluster1 in range(0, len(self.__clusters)):
            for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
                candidate_minimum_distance = self.__calculate_nearest_distance(index_cluster1, index_cluster2);
                
                if (candidate_minimum_distance < minimum_single_distance):
                    minimum_single_distance = candidate_minimum_distance;
                    indexes = [index_cluster1, index_cluster2];

        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];  
        self.__clusters.pop(indexes[1]);   # remove merged cluster.
    
    
    def __calculate_nearest_distance(self, index_cluster1, index_cluster2):
        """!
        @brief Finds two nearest objects in two specified clusters and returns distance between them.
        
        @param[in] (uint) Index of the first cluster.
        @param[in] (uint) Index of the second cluster.
        
        @return The nearest euclidean distance between two clusters.
        
        """
        candidate_minimum_distance = float('Inf');
        
        for index_object1 in self.__clusters[index_cluster1]:
            for index_object2 in self.__clusters[index_cluster2]:
                distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
                if (distance < candidate_minimum_distance):
                    candidate_minimum_distance = distance;
        
        return candidate_minimum_distance;
    
    
    def __calculate_center(self, cluster):
        """!
        @brief Calculates new center.
        
        @return (list) New value of the center of the specified cluster.
        
        """
         
        dimension = len(self.__pointer_data[cluster[0]]);
        center = [0] * dimension;
        for index_point in cluster:
            for index_dimension in range(0, dimension):
                center[index_dimension] += self.__pointer_data[index_point][index_dimension];
         
        for index_dimension in range(0, dimension):
            center[index_dimension] /= len(cluster);
             
        return center;

1		"""!
2
3		@brief Cluster analysis algorithm: agglomerative algorithm.
4		@details Implementation based on book:
5		- K.Anil, J.C.Dubes, R.C.Dubes. Algorithms for Clustering Data. 1988.
6
7		@authors Andrei Novikov ([email protected])
8		@date 2014-2016
9		@copyright GNU Public License
10
11		@cond GNU_PUBLIC_LICENSE
12		PyClustering is free software: you can redistribute it and/or modify
13		it under the terms of the GNU General Public License as published by
14		the Free Software Foundation, either version 3 of the License, or
15		(at your option) any later version.
16
17		PyClustering is distributed in the hope that it will be useful,
18		but WITHOUT ANY WARRANTY; without even the implied warranty of
19		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20		GNU General Public License for more details.
21
22		You should have received a copy of the GNU General Public License
23		along with this program. If not, see <http://www.gnu.org/licenses/>.
24		@endcond
25
26		"""
27
28		from enum import IntEnum;
29
30		from pyclustering.utils import euclidean_distance_sqrt;
31
32		import pyclustering.core.agglomerative_wrapper as wrapper;
33
34
35		class type_link(IntEnum):
36		"""!
37		@brief Enumerator of types of link between clusters.
38
39		"""
40
41		## Nearest objects in clusters is considered as a link.
42		SINGLE_LINK = 0;
43
44		## Farthest objects in clusters is considered as a link.
45		COMPLETE_LINK = 1;
46
47		## Average distance between objects in clusters is considered as a link.
48		AVERAGE_LINK = 2;
49
50		## Distance between centers of clusters is considered as a link.
51		CENTROID_LINK = 3;
52
53
54		class agglomerative:
55		"""!
56		@brief Class represents agglomerative algorithm for cluster analysis.
57
58		Example:
59		@code
60		# sample for cluster analysis (represented by list)
61		sample = read_sample(path_to_sample);
62
63		# create object that uses python code only
64		agglomerative_instance = agglomerative(sample, 2, link_type.CENTROID_LINK)
65
66		# cluster analysis
67		agglomerative_instance.process();
68
69		# obtain results of clustering
70		clusters = agglomerative_instance.get_clusters();
71		@endcode
72
73		"""
74
75		def __init__(self, data, number_clusters, link = None, ccore = False):
76		"""!
77		@brief Constructor of agglomerative hierarchical algorithm.
78
79		@param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by a list or tuple.
80		@param[in] number_clusters (uint): Number of clusters that should be allocated.
81		@param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default.
82		@param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
83
84		"""
85
86		self.__pointer_data = data;
87		self.__number_clusters = number_clusters;
88		self.__similarity = link;
89
90		if (self.__similarity is None):
91		self.__similarity = type_link.CENTROID_LINK;
92
93		self.__clusters = [];
94		self.__ccore = ccore;
95
96		if (self.__similarity == type_link.CENTROID_LINK):
97		self.__centers = self.__pointer_data.copy(); # used in case of usage of centroid links
98
99
100		def process(self):
101		"""!
102		@brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
103
104		@see get_clusters()
105
106		"""
107
108		if (self.__ccore is True):
109		self.__clusters = wrapper.agglomerative_algorithm(self.__pointer_data, self.__number_clusters, self.__similarity);
110
111		else:
112		self.__clusters = [[index] for index in range(0, len(self.__pointer_data))];
113
114		current_number_clusters = len(self.__clusters);
115
116		while (current_number_clusters > self.__number_clusters):
117		self.__merge_similar_clusters();
118		current_number_clusters = len(self.__clusters);
119
120
121		def get_clusters(self):
122		"""!
123		@brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
124
125		@remark Results of clustering can be obtained using corresponding gets methods.
126
127		@return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
128
129		@see process()
130
131		"""
132
133		return self.__clusters;
134
135
136		def __merge_similar_clusters(self):
137		"""!
138		@brief Merges the most similar clusters in line with link type.
139
140		"""
141
142		if (self.__similarity == type_link.AVERAGE_LINK):
143		self.__merge_by_average_link();
144
145		elif (self.__similarity == type_link.CENTROID_LINK):
146		self.__merge_by_centroid_link();
147
148		elif (self.__similarity == type_link.COMPLETE_LINK):
149		self.__merge_by_complete_link();
150
151		elif (self.__similarity == type_link.SINGLE_LINK):
152		self.__merge_by_signle_link();
153
154		else:
155		raise NameError('Not supported similarity is used');
156
157
158		def __merge_by_average_link(self):
159		"""!
160		@brief Merges the most similar clusters in line with average link type.
161
162		"""
163
164		minimum_average_distance = float('Inf');
165
166		for index_cluster1 in range(0, len(self.__clusters)):
167		for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
168
169		# Find farthest objects
170		candidate_average_distance = 0.0;
171		for index_object1 in self.__clusters[index_cluster1]:
172		for index_object2 in self.__clusters[index_cluster2]:
173		candidate_average_distance += euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
174
175		candidate_average_distance /= (len(self.__clusters[index_cluster1]) + len(self.__clusters[index_cluster2]));
176
177		if (candidate_average_distance < minimum_average_distance):
178		minimum_average_distance = candidate_average_distance;
179		indexes = [index_cluster1, index_cluster2];
180
181		self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
182		self.__clusters.pop(indexes[1]); # remove merged cluster.
183
184
185		def __merge_by_centroid_link(self):
186		"""!
187		@brief Merges the most similar clusters in line with centroid link type.
188
189		"""
190
191		minimum_centroid_distance = float('Inf');
192		indexes = None;
193
194		for index1 in range(0, len(self.__centers)):
195		for index2 in range(index1 + 1, len(self.__centers)):
196		distance = euclidean_distance_sqrt(self.__centers[index1], self.__centers[index2]);
197		if (distance < minimum_centroid_distance):
198		minimum_centroid_distance = distance;
199		indexes = [index1, index2];
200
201		self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
202		self.__centers[indexes[0]] = self.__calculate_center(self.__clusters[indexes[0]]);
203
204		self.__clusters.pop(indexes[1]); # remove merged cluster.
205		self.__centers.pop(indexes[1]); # remove merged center.
206
207
208	View Code Duplication	def __merge_by_complete_link(self):
		0 ignored issues – show Duplication introduced 2016-04-28 20:11 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
209		"""!
210		@brief Merges the most similar clusters in line with complete link type.
211
212		"""
213
214		minimum_complete_distance = float('Inf');
215		indexes = None;
216
217		for index_cluster1 in range(0, len(self.__clusters)):
218		for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
219		candidate_maximum_distance = self.__calculate_farthest_distance(index_cluster1, index_cluster2);
220
221		if (candidate_maximum_distance < minimum_complete_distance):
222		minimum_complete_distance = candidate_maximum_distance;
223		indexes = [index_cluster1, index_cluster2];
224
225		self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
226		self.__clusters.pop(indexes[1]); # remove merged cluster.
227
228
229		def __calculate_farthest_distance(self, index_cluster1, index_cluster2):
230		"""!
231		@brief Finds two farthest objects in two specified clusters in terms and returns distance between them.
232
233		@param[in] (uint) Index of the first cluster.
234		@param[in] (uint) Index of the second cluster.
235
236		@return The farthest euclidean distance between two clusters.
237
238		"""
239		candidate_maximum_distance = 0.0;
240		for index_object1 in self.__clusters[index_cluster1]:
241		for index_object2 in self.__clusters[index_cluster2]:
242		distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
243
244		if (distance > candidate_maximum_distance):
245		candidate_maximum_distance = distance;
246
247		return candidate_maximum_distance;
248
249
250	View Code Duplication	def __merge_by_signle_link(self):
		0 ignored issues – show Duplication introduced 2016-04-28 20:11 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
251		"""!
252		@brief Merges the most similar clusters in line with single link type.
253
254		"""
255
256		minimum_single_distance = float('Inf');
257		indexes = None;
258
259		for index_cluster1 in range(0, len(self.__clusters)):
260		for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
261		candidate_minimum_distance = self.__calculate_nearest_distance(index_cluster1, index_cluster2);
262
263		if (candidate_minimum_distance < minimum_single_distance):
264		minimum_single_distance = candidate_minimum_distance;
265		indexes = [index_cluster1, index_cluster2];
266
267		self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
268		self.__clusters.pop(indexes[1]); # remove merged cluster.
269
270
271		def __calculate_nearest_distance(self, index_cluster1, index_cluster2):
272		"""!
273		@brief Finds two nearest objects in two specified clusters and returns distance between them.
274
275		@param[in] (uint) Index of the first cluster.
276		@param[in] (uint) Index of the second cluster.
277
278		@return The nearest euclidean distance between two clusters.
279
280		"""
281		candidate_minimum_distance = float('Inf');
282
283		for index_object1 in self.__clusters[index_cluster1]:
284		for index_object2 in self.__clusters[index_cluster2]:
285		distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
286		if (distance < candidate_minimum_distance):
287		candidate_minimum_distance = distance;
288
289		return candidate_minimum_distance;
290
291
292		def __calculate_center(self, cluster):
293		"""!
294		@brief Calculates new center.
295
296		@return (list) New value of the center of the specified cluster.
297
298		"""
299
300		dimension = len(self.__pointer_data[cluster[0]]);
301		center = [0] * dimension;
302		for index_point in cluster:
303		for index_dimension in range(0, dimension):
304		center[index_dimension] += self.__pointer_data[index_point][index_dimension];
305
306		for index_dimension in range(0, dimension):
307		center[index_dimension] /= len(cluster);
308
309		return center;

annoviko / pyclustering

Push — master ( 1d669d...037fe6 )

agglomerative B

Complexity

Size/Duplication

11 Methods

How to fix Duplicated Code Complexity

Duplicated Code

Complex Class

Duplication Side-by-Side

Filter issues like