agglomerative - Code Metrics - Inspection of "[pyclustering.cluster] Clustering result encoder,..." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 6ed467...f3f897 )

by Andrei

created 2016-12-23 11:53 UTC

agglomerative B

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	269
Duplicated Lines	14.13 %

Importance

Changes

Metric	Value
dl	38
loc	269
rs	8.3396
c	0
b	0
f	0
wmc	44

12 Methods

Rating	Name	Duplication	Size	Complexity
B	__merge_by_average_link()	0	25	6
A	__merge_by_centroid_link()	14	21	4
A	__calculate_nearest_distance()	0	19	4
A	__init__()	0	23	3
A	get_clusters()	0	13	1
A	__merge_by_signle_link()	3	19	4
A	__calculate_farthest_distance()	14	19	4
A	__merge_by_complete_link()	3	19	4
B	__merge_similar_clusters()	0	20	5
A	__calculate_center()	0	18	4
A	get_cluster_encoding()	0	11	1
A	process()	0	19	4

How to fix Duplicated Code Complexity

"""!

@brief Cluster analysis algorithm: agglomerative algorithm.
@details Implementation based on book:
         - K.Anil, J.C.Dubes, R.C.Dubes. Algorithms for Clustering Data. 1988.

@authors Andrei Novikov ([email protected])
@date 2014-2016
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


from enum import IntEnum;

from pyclustering.cluster.encoder import type_encoding;

from pyclustering.utils import euclidean_distance_sqrt;

import pyclustering.core.agglomerative_wrapper as wrapper;


class type_link(IntEnum):
    """!
    @brief Enumerator of types of link between clusters.
    
    """

    ## Nearest objects in clusters is considered as a link.
    SINGLE_LINK = 0;
    
    ## Farthest objects in clusters is considered as a link.
    COMPLETE_LINK = 1;
    
    ## Average distance between objects in clusters is considered as a link.
    AVERAGE_LINK = 2;
    
    ## Distance between centers of clusters is considered as a link.
    CENTROID_LINK = 3;


class agglomerative:
    """!
    @brief Class represents agglomerative algorithm for cluster analysis.
    
    Example:
    @code
        # sample for cluster analysis (represented by list)
        sample = read_sample(path_to_sample);
        
        # create object that uses python code only
        agglomerative_instance = agglomerative(sample, 2, link_type.CENTROID_LINK)
        
        # cluster analysis
        agglomerative_instance.process();
        
        # obtain results of clustering
        clusters = agglomerative_instance.get_clusters();  
    @endcode
    
    """
    
    def __init__(self, data, number_clusters, link = None, ccore = False):
        """!
        @brief Constructor of agglomerative hierarchical algorithm.
        
        @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by a list or tuple.
        @param[in] number_clusters (uint): Number of clusters that should be allocated.
        @param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default.
        @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
        
        """  
        
        self.__pointer_data = data;
        self.__number_clusters = number_clusters;
        self.__similarity = link;
        
        if (self.__similarity is None):
            self.__similarity = type_link.CENTROID_LINK;
        
        self.__clusters = [];
        self.__ccore = ccore;
        
        if (self.__similarity == type_link.CENTROID_LINK):
            self.__centers = self.__pointer_data.copy();    # used in case of usage of centroid links
    
    
    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
        
        @see get_clusters()
        
        """
        
        if (self.__ccore is True):
            self.__clusters = wrapper.agglomerative_algorithm(self.__pointer_data, self.__number_clusters, self.__similarity);

        else:
            self.__clusters = [[index] for index in range(0, len(self.__pointer_data))];
            
            current_number_clusters = len(self.__clusters);
                
            while (current_number_clusters > self.__number_clusters):
                self.__merge_similar_clusters();
                current_number_clusters = len(self.__clusters);
    
    
    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
        
        @remark Results of clustering can be obtained using corresponding gets methods.
        
        @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
        
        @see process()
        
        """
        
        return self.__clusters;
    
    
    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicate how clusters are encoded.
        
        @return (type_encoding) Clustering result representation.
        
        @see get_clusters()
        
        """
        
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
    
    
    def __merge_similar_clusters(self):
        """!
        @brief Merges the most similar clusters in line with link type.
        
        """
        
        if (self.__similarity == type_link.AVERAGE_LINK):
            self.__merge_by_average_link();
        
        elif (self.__similarity == type_link.CENTROID_LINK):
            self.__merge_by_centroid_link();
        
        elif (self.__similarity == type_link.COMPLETE_LINK):
            self.__merge_by_complete_link();
        
        elif (self.__similarity == type_link.SINGLE_LINK):
            self.__merge_by_signle_link();
        
        else:
            raise NameError('Not supported similarity is used');
    
    
    def __merge_by_average_link(self):
        """!
        @brief Merges the most similar clusters in line with average link type.
        
        """
        
        minimum_average_distance = float('Inf');
        
        for index_cluster1 in range(0, len(self.__clusters)):
            for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
                
                # Find farthest objects
                candidate_average_distance = 0.0;
                for index_object1 in self.__clusters[index_cluster1]:
                    for index_object2 in self.__clusters[index_cluster2]:
                        candidate_average_distance += euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
                
                candidate_average_distance /= (len(self.__clusters[index_cluster1]) + len(self.__clusters[index_cluster2]));
                
                if (candidate_average_distance < minimum_average_distance):
                    minimum_average_distance = candidate_average_distance;
                    indexes = [index_cluster1, index_cluster2];
        
        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];  
        self.__clusters.pop(indexes[1]);   # remove merged cluster.  
        
    
    def __merge_by_centroid_link(self):
        """!
        @brief Merges the most similar clusters in line with centroid link type.
        
        """
        
        minimum_centroid_distance = float('Inf');
        indexes = None;

        
        for index1 in range(0, len(self.__centers)):
            for index2 in range(index1 + 1, len(self.__centers)):
                distance = euclidean_distance_sqrt(self.__centers[index1], self.__centers[index2]);
                if (distance < minimum_centroid_distance):
                    minimum_centroid_distance = distance;
                    indexes = [index1, index2];
        
        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
        self.__centers[indexes[0]] = self.__calculate_center(self.__clusters[indexes[0]]);
         
        self.__clusters.pop(indexes[1]);   # remove merged cluster.
        self.__centers.pop(indexes[1]);    # remove merged center.
    
    
    def __merge_by_complete_link(self):
        """!
        @brief Merges the most similar clusters in line with complete link type.
        
        """
        
        minimum_complete_distance = float('Inf');
        indexes = None;
        
        for index_cluster1 in range(0, len(self.__clusters)):
            for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
                candidate_maximum_distance = self.__calculate_farthest_distance(index_cluster1, index_cluster2);
                
                if (candidate_maximum_distance < minimum_complete_distance):
                    minimum_complete_distance = candidate_maximum_distance;
                    indexes = [index_cluster1, index_cluster2];

        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];  
        self.__clusters.pop(indexes[1]);   # remove merged cluster.        
    
    
    def __calculate_farthest_distance(self, index_cluster1, index_cluster2):
        """!
        @brief Finds two farthest objects in two specified clusters in terms and returns distance between them.
        
        @param[in] (uint) Index of the first cluster.
        @param[in] (uint) Index of the second cluster.

        
        @return The farthest euclidean distance between two clusters.
        
        """
        candidate_maximum_distance = 0.0;
        for index_object1 in self.__clusters[index_cluster1]:
            for index_object2 in self.__clusters[index_cluster2]:
                distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
                
                if (distance > candidate_maximum_distance):
                    candidate_maximum_distance = distance;
    
        return candidate_maximum_distance;
    
    
    def __merge_by_signle_link(self):
        """!
        @brief Merges the most similar clusters in line with single link type.
        
        """
        
        minimum_single_distance = float('Inf');
        indexes = None;
        
        for index_cluster1 in range(0, len(self.__clusters)):
            for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
                candidate_minimum_distance = self.__calculate_nearest_distance(index_cluster1, index_cluster2);
                
                if (candidate_minimum_distance < minimum_single_distance):
                    minimum_single_distance = candidate_minimum_distance;
                    indexes = [index_cluster1, index_cluster2];

        self.__clusters[indexes[0]] += self.__clusters[indexes[1]];  
        self.__clusters.pop(indexes[1]);   # remove merged cluster.
    
    
    def __calculate_nearest_distance(self, index_cluster1, index_cluster2):
        """!
        @brief Finds two nearest objects in two specified clusters and returns distance between them.
        
        @param[in] (uint) Index of the first cluster.
        @param[in] (uint) Index of the second cluster.
        
        @return The nearest euclidean distance between two clusters.
        
        """
        candidate_minimum_distance = float('Inf');
        
        for index_object1 in self.__clusters[index_cluster1]:
            for index_object2 in self.__clusters[index_cluster2]:
                distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
                if (distance < candidate_minimum_distance):
                    candidate_minimum_distance = distance;
        
        return candidate_minimum_distance;
    
    
    def __calculate_center(self, cluster):
        """!
        @brief Calculates new center.
        
        @return (list) New value of the center of the specified cluster.
        
        """
         
        dimension = len(self.__pointer_data[cluster[0]]);
        center = [0] * dimension;
        for index_point in cluster:
            for index_dimension in range(0, dimension):
                center[index_dimension] += self.__pointer_data[index_point][index_dimension];
         
        for index_dimension in range(0, dimension):
            center[index_dimension] /= len(cluster);
             
        return center;

1		"""!
2
3		@brief Cluster analysis algorithm: agglomerative algorithm.
4		@details Implementation based on book:
5		- K.Anil, J.C.Dubes, R.C.Dubes. Algorithms for Clustering Data. 1988.
6
7		@authors Andrei Novikov ([email protected])
8		@date 2014-2016
9		@copyright GNU Public License
10
11		@cond GNU_PUBLIC_LICENSE
12		PyClustering is free software: you can redistribute it and/or modify
13		it under the terms of the GNU General Public License as published by
14		the Free Software Foundation, either version 3 of the License, or
15		(at your option) any later version.
16
17		PyClustering is distributed in the hope that it will be useful,
18		but WITHOUT ANY WARRANTY; without even the implied warranty of
19		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20		GNU General Public License for more details.
21
22		You should have received a copy of the GNU General Public License
23		along with this program. If not, see <http://www.gnu.org/licenses/>.
24		@endcond
25
26		"""
27
28
29		from enum import IntEnum;
30
31		from pyclustering.cluster.encoder import type_encoding;
32
33		from pyclustering.utils import euclidean_distance_sqrt;
34
35		import pyclustering.core.agglomerative_wrapper as wrapper;
36
37
38		class type_link(IntEnum):
39		"""!
40		@brief Enumerator of types of link between clusters.
41
42		"""
43
44		## Nearest objects in clusters is considered as a link.
45		SINGLE_LINK = 0;
46
47		## Farthest objects in clusters is considered as a link.
48		COMPLETE_LINK = 1;
49
50		## Average distance between objects in clusters is considered as a link.
51		AVERAGE_LINK = 2;
52
53		## Distance between centers of clusters is considered as a link.
54		CENTROID_LINK = 3;
55
56
57		class agglomerative:
58		"""!
59		@brief Class represents agglomerative algorithm for cluster analysis.
60
61		Example:
62		@code
63		# sample for cluster analysis (represented by list)
64		sample = read_sample(path_to_sample);
65
66		# create object that uses python code only
67		agglomerative_instance = agglomerative(sample, 2, link_type.CENTROID_LINK)
68
69		# cluster analysis
70		agglomerative_instance.process();
71
72		# obtain results of clustering
73		clusters = agglomerative_instance.get_clusters();
74		@endcode
75
76		"""
77
78		def __init__(self, data, number_clusters, link = None, ccore = False):
79		"""!
80		@brief Constructor of agglomerative hierarchical algorithm.
81
82		@param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by a list or tuple.
83		@param[in] number_clusters (uint): Number of clusters that should be allocated.
84		@param[in] link (type_link): Link type that is used for calculation similarity between objects and clusters, if it is not specified centroid link will be used by default.
85		@param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
86
87		"""
88
89		self.__pointer_data = data;
90		self.__number_clusters = number_clusters;
91		self.__similarity = link;
92
93		if (self.__similarity is None):
94		self.__similarity = type_link.CENTROID_LINK;
95
96		self.__clusters = [];
97		self.__ccore = ccore;
98
99		if (self.__similarity == type_link.CENTROID_LINK):
100		self.__centers = self.__pointer_data.copy(); # used in case of usage of centroid links
101
102
103		def process(self):
104		"""!
105		@brief Performs cluster analysis in line with rules of agglomerative algorithm and similarity.
106
107		@see get_clusters()
108
109		"""
110
111		if (self.__ccore is True):
112		self.__clusters = wrapper.agglomerative_algorithm(self.__pointer_data, self.__number_clusters, self.__similarity);
113
114		else:
115		self.__clusters = [[index] for index in range(0, len(self.__pointer_data))];
116
117		current_number_clusters = len(self.__clusters);
118
119		while (current_number_clusters > self.__number_clusters):
120		self.__merge_similar_clusters();
121		current_number_clusters = len(self.__clusters);
122
123
124		def get_clusters(self):
125		"""!
126		@brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
127
128		@remark Results of clustering can be obtained using corresponding gets methods.
129
130		@return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
131
132		@see process()
133
134		"""
135
136		return self.__clusters;
137
138
139		def get_cluster_encoding(self):
140		"""!
141		@brief Returns clustering result representation type that indicate how clusters are encoded.
142
143		@return (type_encoding) Clustering result representation.
144
145		@see get_clusters()
146
147		"""
148
149		return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
150
151
152		def __merge_similar_clusters(self):
153		"""!
154		@brief Merges the most similar clusters in line with link type.
155
156		"""
157
158		if (self.__similarity == type_link.AVERAGE_LINK):
159		self.__merge_by_average_link();
160
161		elif (self.__similarity == type_link.CENTROID_LINK):
162		self.__merge_by_centroid_link();
163
164		elif (self.__similarity == type_link.COMPLETE_LINK):
165		self.__merge_by_complete_link();
166
167		elif (self.__similarity == type_link.SINGLE_LINK):
168		self.__merge_by_signle_link();
169
170		else:
171		raise NameError('Not supported similarity is used');
172
173
174		def __merge_by_average_link(self):
175		"""!
176		@brief Merges the most similar clusters in line with average link type.
177
178		"""
179
180		minimum_average_distance = float('Inf');
181
182		for index_cluster1 in range(0, len(self.__clusters)):
183		for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
184
185		# Find farthest objects
186		candidate_average_distance = 0.0;
187		for index_object1 in self.__clusters[index_cluster1]:
188		for index_object2 in self.__clusters[index_cluster2]:
189		candidate_average_distance += euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
190
191		candidate_average_distance /= (len(self.__clusters[index_cluster1]) + len(self.__clusters[index_cluster2]));
192
193		if (candidate_average_distance < minimum_average_distance):
194		minimum_average_distance = candidate_average_distance;
195		indexes = [index_cluster1, index_cluster2];
196
197		self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
198		self.__clusters.pop(indexes[1]); # remove merged cluster.
199
200
201		def __merge_by_centroid_link(self):
202		"""!
203		@brief Merges the most similar clusters in line with centroid link type.
204
205		"""
206
207		minimum_centroid_distance = float('Inf');
208	View Code Duplication	indexes = None;
		0 ignored issues – show Duplication introduced 2016-04-28 20:11 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
209
210		for index1 in range(0, len(self.__centers)):
211		for index2 in range(index1 + 1, len(self.__centers)):
212		distance = euclidean_distance_sqrt(self.__centers[index1], self.__centers[index2]);
213		if (distance < minimum_centroid_distance):
214		minimum_centroid_distance = distance;
215		indexes = [index1, index2];
216
217		self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
218		self.__centers[indexes[0]] = self.__calculate_center(self.__clusters[indexes[0]]);
219
220		self.__clusters.pop(indexes[1]); # remove merged cluster.
221		self.__centers.pop(indexes[1]); # remove merged center.
222
223
224		def __merge_by_complete_link(self):
225		"""!
226		@brief Merges the most similar clusters in line with complete link type.
227
228		"""
229
230		minimum_complete_distance = float('Inf');
231		indexes = None;
232
233		for index_cluster1 in range(0, len(self.__clusters)):
234		for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
235		candidate_maximum_distance = self.__calculate_farthest_distance(index_cluster1, index_cluster2);
236
237		if (candidate_maximum_distance < minimum_complete_distance):
238		minimum_complete_distance = candidate_maximum_distance;
239		indexes = [index_cluster1, index_cluster2];
240
241		self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
242		self.__clusters.pop(indexes[1]); # remove merged cluster.
243
244
245		def __calculate_farthest_distance(self, index_cluster1, index_cluster2):
246		"""!
247		@brief Finds two farthest objects in two specified clusters in terms and returns distance between them.
248
249		@param[in] (uint) Index of the first cluster.
250	View Code Duplication	@param[in] (uint) Index of the second cluster.
		0 ignored issues – show Duplication introduced 2016-04-28 20:11 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
251
252		@return The farthest euclidean distance between two clusters.
253
254		"""
255		candidate_maximum_distance = 0.0;
256		for index_object1 in self.__clusters[index_cluster1]:
257		for index_object2 in self.__clusters[index_cluster2]:
258		distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
259
260		if (distance > candidate_maximum_distance):
261		candidate_maximum_distance = distance;
262
263		return candidate_maximum_distance;
264
265
266		def __merge_by_signle_link(self):
267		"""!
268		@brief Merges the most similar clusters in line with single link type.
269
270		"""
271
272		minimum_single_distance = float('Inf');
273		indexes = None;
274
275		for index_cluster1 in range(0, len(self.__clusters)):
276		for index_cluster2 in range(index_cluster1 + 1, len(self.__clusters)):
277		candidate_minimum_distance = self.__calculate_nearest_distance(index_cluster1, index_cluster2);
278
279		if (candidate_minimum_distance < minimum_single_distance):
280		minimum_single_distance = candidate_minimum_distance;
281		indexes = [index_cluster1, index_cluster2];
282
283		self.__clusters[indexes[0]] += self.__clusters[indexes[1]];
284		self.__clusters.pop(indexes[1]); # remove merged cluster.
285
286
287		def __calculate_nearest_distance(self, index_cluster1, index_cluster2):
288		"""!
289		@brief Finds two nearest objects in two specified clusters and returns distance between them.
290
291		@param[in] (uint) Index of the first cluster.
292		@param[in] (uint) Index of the second cluster.
293
294		@return The nearest euclidean distance between two clusters.
295
296		"""
297		candidate_minimum_distance = float('Inf');
298
299		for index_object1 in self.__clusters[index_cluster1]:
300		for index_object2 in self.__clusters[index_cluster2]:
301		distance = euclidean_distance_sqrt(self.__pointer_data[index_object1], self.__pointer_data[index_object2]);
302		if (distance < candidate_minimum_distance):
303		candidate_minimum_distance = distance;
304
305		return candidate_minimum_distance;
306
307
308		def __calculate_center(self, cluster):
309		"""!
310		@brief Calculates new center.
311
312		@return (list) New value of the center of the specified cluster.
313
314		"""
315
316		dimension = len(self.__pointer_data[cluster[0]]);
317		center = [0] * dimension;
318		for index_point in cluster:
319		for index_dimension in range(0, dimension):
320		center[index_dimension] += self.__pointer_data[index_point][index_dimension];
321
322		for index_dimension in range(0, dimension):
323		center[index_dimension] /= len(cluster);
324
325		return center;

annoviko / pyclustering

Push — master ( 6ed467...f3f897 )

agglomerative B

Complexity

Size/Duplication

Importance

12 Methods

How to fix Duplicated Code Complexity

Duplicated Code

Complex Class

Duplication Side-by-Side

Filter issues like