birch.get_cluster_encoding() - Code Metrics - Inspection of "[pyclustering.cluster] Clustering result encoder,..." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 6ed467...f3f897 )

by Andrei

created 2016-12-23 11:53 UTC

birch.get_cluster_encoding() A

↳ Parent: birch

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
dl	0
loc	11
rs	9.4285
c	0
b	0
f	0

"""!

@brief Cluster analysis algorithm: BIRCH
@details Implementation based on article:
         - T.Zhang, R.Ramakrishnan, M.Livny. BIRCH: An Efficient Data Clustering Method for Very Large Databases. 1996.
         
@authors Andrei Novikov ([email protected])
@date 2014-2016
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


from pyclustering.utils import linear_sum, square_sum;

from pyclustering.cluster.encoder import type_encoding;

from pyclustering.container.cftree import cftree, cfentry, measurement_type;

from copy import copy;



class birch:
    """!
    @brief Class represents clustering algorithm BIRCH.
    
    Example:
    @code
        # sample for cluster analysis (represented by list)
        sample = read_sample(path_to_sample);
        
        # create object of birch that uses CCORE for processing
        birch_instance = birch(sample, 2, 5, 5, 0.05, measurement_type.CENTROID_EUCLIDIAN_DISTANCE, 200, True);
        
        # cluster analysis
        birch_instance.process();
        
        # obtain results of clustering
        clusters = birch_instance.get_clusters();
    @endcode
    
    """
    
    def __init__(self, data, number_clusters, branching_factor = 5, max_node_entries = 5, initial_diameter = 0.1, type_measurement = measurement_type.CENTROID_EUCLIDIAN_DISTANCE, entry_size_limit = 200, diameter_multiplier = 1.5, outlier_detector = 0, ccore = False):
        """!
        @brief Constructor of clustering algorithm BIRCH.
        
        @param[in] data (list): Input data presented as list of points (objects), where each point should be represented by list or tuple.
        @param[in] number_clusters (uint): Number of clusters that should be allocated.
        @param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree.
        @param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree.
        @param[in] initial_diameter (double): Initial diameter that used for CF-Tree construction, it can be increase if entry_size_limit is exceeded.
        @param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics.
        @param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded during creation then diameter is increased and CF-Tree is rebuilt.
        @param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when entry_size_limit is exceeded.
        @param[in] outlier_detector (uint): Minimum number of data points that should be contained by node to be considered as non-outlier node.
        @param[in] ccore (bool): If True than DLL CCORE (C++ solution) will be used for solving the problem.
        
        @remark Despite eight arguments only the first two is mandatory, others can be ommitted. In this case default values are used for instance creation.
        
        Example:
        @code
            birch_instance1 = birch(sample1, 2);    # two clusters should be allocated
            birch_instance2 = birch(sample2, 5);    # five clusters should be allocated
            
            # three clusters should be allocated, but also each leaf node can have maximum 5 
            # entries and each entry can have maximum 5 descriptors with initial diameter 0.05.
            birch_instance3 = birch(sample3, 3, 5, 5, 0.05);
        @endcode
        
        """
        
        self.__pointer_data = data;
        self.__number_clusters = number_clusters;
        
        self.__measurement_type = type_measurement;
        self.__entry_size_limit = entry_size_limit;
        self.__diameter_multiplier = diameter_multiplier;
        self.__outlier_detector = outlier_detector;
        self.__ccore = ccore;
        
        self.__features = None;
        self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement);
        
        self.__clusters = [];
        self.__noise = [];


    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of BIRCH algorithm.
        
        @remark Results of clustering can be obtained using corresponding gets methods.
        
        @see get_clusters()
        @see get_noise()
        
        """
        
        self.__insert_data();
        self.__extract_features();

        # in line with specification modify hierarchical algorithm should be used for further clustering
        current_number_clusters = len(self.__features);
        
        while (current_number_clusters > self.__number_clusters):
            indexes = self.__find_nearest_cluster_features();
            
            self.__features[indexes[0]] += self.__features[indexes[1]];
            self.__features.pop(indexes[1]);
            
            current_number_clusters = len(self.__features);
            
        # decode data
        self.__decode_data();
    
    
    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
        
        @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
        
        @return (list) List of allocated clusters.
        
        @see process()
        @see get_noise()
        
        """
        
        return self.__clusters;
    
    
    def get_noise(self):
        """!
        @brief Returns allocated noise.
        
        @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
        
        @return (list) List of indexes that are marked as a noise.
        
        @see process()
        @see get_clusters()
        
        """
        
        return self.__noise;


    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicate how clusters are encoded.
        
        @return (type_encoding) Clustering result representation.
        
        @see get_clusters()
        
        """
        
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;


    def __extract_features(self):
        """!
        @brief Extracts features and outlier features from CF-tree cluster.
        
        """
        
        self.__features = [];
        self.__outlier_features = [];
        
        if (len(self.__tree.leafes) == 1):
            # parameters are too general, copy all entries
            for entry in self.__tree.leafes[0].entries:
                if (self.__outlier_detector < entry.number_points):
                    self.__features.append(entry);
                else:
                    self.__outlier_features.append(entry);
        else:
            # copy all leaf clustering features
            for node in self.__tree.leafes:
                if (self.__outlier_detector < node.feature.number_points):
                    self.__features.append(node.feature);
                else:

                    self.__outlier_features.append(node.feature);
    
    
    def __decode_data(self):
        """!
        @brief Decodes data from CF-tree features.
        
        """
        
        self.__clusters = [ [] for _ in range(self.__number_clusters) ];
        self.__noise = [];
        
        for index_point in range(0, len(self.__pointer_data)):
            (cluster_distance, cluster_index) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__features);
            (outlier_distance, _) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__outlier_features);
            
            if (cluster_distance < outlier_distance):
                self.__clusters[cluster_index].append(index_point);
            else:
                self.__noise.append(index_point);
    
    
    def __insert_data(self):
        """!
        @brief Inserts input data to the tree.
        
        @remark If number of maximum number of entries is exceeded than diameter is increased and tree is rebuilt.
        
        """
        
        for index_point in range(0, len(self.__pointer_data)):
            point = self.__pointer_data[index_point];
            self.__tree.insert_cluster( [ point ] );
            
            if (self.__tree.amount_entries > self.__entry_size_limit):
                self.__tree = self.__rebuild_tree(index_point);
        
        #self.__tree.show_feature_destibution(self.__pointer_data);
    
    
    def __rebuild_tree(self, index_point):
        """!
        @brief Rebuilt tree in case of maxumum number of entries is exceeded.
        
        @param[in] index_point (uint): Index of point that is used as end point of re-building.
        
        @return (cftree) Rebuilt tree with encoded points till specified point from input data space.
        
        """
        
        rebuild_result = False;
        increased_diameter = self.__tree.threshold * self.__diameter_multiplier;
        
        tree = None;
        
        while(rebuild_result is False):
            # increase diameter and rebuild tree
            if (increased_diameter == 0.0):
                increased_diameter = 1.0;
            
            # build tree with update parameters
            tree = cftree(self.__tree.branch_factor, self.__tree.max_entries, increased_diameter, self.__tree.type_measurement);
            
            for index_point in range(0, index_point + 1):
                point = self.__pointer_data[index_point];
                tree.insert_cluster([point]);
            
                if (tree.amount_entries > self.__entry_size_limit):
                    increased_diameter *= self.__diameter_multiplier;
                    continue;
            
            # Re-build is successful.
            rebuild_result = True;
        
        return tree;
    
    
    def __find_nearest_cluster_features(self):
        """!
        @brief Find pair of nearest CF entries.
        
        @return (list) List of two nearest enties that are represented by list [index_point1, index_point2].
        
        """
        
        minimum_distance = float("Inf");
        index1 = 0;
        index2 = 0;
        
        for index_candidate1 in range(0, len(self.__features)):
            feature1 = self.__features[index_candidate1];
            for index_candidate2 in range(index_candidate1 + 1, len(self.__features)):
                feature2 = self.__features[index_candidate2];
                
                distance = feature1.get_distance(feature2, self.__measurement_type);
                if (distance < minimum_distance):
                    minimum_distance = distance;
                    
                    index1 = index_candidate1;
                    index2 = index_candidate2;
        
        return [index1, index2];
    
    
    def __get_nearest_feature(self, point, feature_collection):
        """!
        @brief Find nearest entry for specified point.
        
        @param[in] point (list): Pointer to point from input dataset.
        @param[in] feature_collection (list): Feature collection that is used for obtaining nearest feature for the specified point.
        
        @return (double, uint) Tuple of distance to nearest entry to the specified point and index of that entry.
        
        """
        
        minimum_distance = float("Inf");
        index_nearest_feature = -1;
        
        for index_entry in range(0, len(feature_collection)):
            point_entry = cfentry(1, linear_sum([ point ]), square_sum([ point ]));
            
            distance = feature_collection[index_entry].get_distance(point_entry, self.__measurement_type);
            if (distance < minimum_distance):
                minimum_distance = distance;
                index_nearest_feature = index_entry;
                
        return (minimum_distance, index_nearest_feature);


1		"""!
2
3		@brief Cluster analysis algorithm: BIRCH
4		@details Implementation based on article:
5		- T.Zhang, R.Ramakrishnan, M.Livny. BIRCH: An Efficient Data Clustering Method for Very Large Databases. 1996.
6
7		@authors Andrei Novikov ([email protected])
8		@date 2014-2016
9		@copyright GNU Public License
10
11		@cond GNU_PUBLIC_LICENSE
12		PyClustering is free software: you can redistribute it and/or modify
13		it under the terms of the GNU General Public License as published by
14		the Free Software Foundation, either version 3 of the License, or
15		(at your option) any later version.
16
17		PyClustering is distributed in the hope that it will be useful,
18		but WITHOUT ANY WARRANTY; without even the implied warranty of
19		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20		GNU General Public License for more details.
21
22		You should have received a copy of the GNU General Public License
23		along with this program. If not, see <http://www.gnu.org/licenses/>.
24		@endcond
25
26		"""
27
28
29		from pyclustering.utils import linear_sum, square_sum;
30
31		from pyclustering.cluster.encoder import type_encoding;
32
33		from pyclustering.container.cftree import cftree, cfentry, measurement_type;
34
35		from copy import copy;
		0 ignored issues – show Unused Code introduced 2016-06-08 16:10 UTC by Report Bug Copy Issue Report Unused copy imported from copy Loading history...
36
37
38		class birch:
39		"""!
40		@brief Class represents clustering algorithm BIRCH.
41
42		Example:
43		@code
44		# sample for cluster analysis (represented by list)
45		sample = read_sample(path_to_sample);
46
47		# create object of birch that uses CCORE for processing
48		birch_instance = birch(sample, 2, 5, 5, 0.05, measurement_type.CENTROID_EUCLIDIAN_DISTANCE, 200, True);
49
50		# cluster analysis
51		birch_instance.process();
52
53		# obtain results of clustering
54		clusters = birch_instance.get_clusters();
55		@endcode
56
57		"""
58
59		def __init__(self, data, number_clusters, branching_factor = 5, max_node_entries = 5, initial_diameter = 0.1, type_measurement = measurement_type.CENTROID_EUCLIDIAN_DISTANCE, entry_size_limit = 200, diameter_multiplier = 1.5, outlier_detector = 0, ccore = False):
60		"""!
61		@brief Constructor of clustering algorithm BIRCH.
62
63		@param[in] data (list): Input data presented as list of points (objects), where each point should be represented by list or tuple.
64		@param[in] number_clusters (uint): Number of clusters that should be allocated.
65		@param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree.
66		@param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree.
67		@param[in] initial_diameter (double): Initial diameter that used for CF-Tree construction, it can be increase if entry_size_limit is exceeded.
68		@param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics.
69		@param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded during creation then diameter is increased and CF-Tree is rebuilt.
70		@param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when entry_size_limit is exceeded.
71		@param[in] outlier_detector (uint): Minimum number of data points that should be contained by node to be considered as non-outlier node.
72		@param[in] ccore (bool): If True than DLL CCORE (C++ solution) will be used for solving the problem.
73
74		@remark Despite eight arguments only the first two is mandatory, others can be ommitted. In this case default values are used for instance creation.
75
76		Example:
77		@code
78		birch_instance1 = birch(sample1, 2); # two clusters should be allocated
79		birch_instance2 = birch(sample2, 5); # five clusters should be allocated
80
81		# three clusters should be allocated, but also each leaf node can have maximum 5
82		# entries and each entry can have maximum 5 descriptors with initial diameter 0.05.
83		birch_instance3 = birch(sample3, 3, 5, 5, 0.05);
84		@endcode
85
86		"""
87
88		self.__pointer_data = data;
89		self.__number_clusters = number_clusters;
90
91		self.__measurement_type = type_measurement;
92		self.__entry_size_limit = entry_size_limit;
93		self.__diameter_multiplier = diameter_multiplier;
94		self.__outlier_detector = outlier_detector;
95		self.__ccore = ccore;
96
97		self.__features = None;
98		self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement);
99
100		self.__clusters = [];
101		self.__noise = [];
102
103
104		def process(self):
105		"""!
106		@brief Performs cluster analysis in line with rules of BIRCH algorithm.
107
108		@remark Results of clustering can be obtained using corresponding gets methods.
109
110		@see get_clusters()
111		@see get_noise()
112
113		"""
114
115		self.__insert_data();
116		self.__extract_features();
117
118		# in line with specification modify hierarchical algorithm should be used for further clustering
119		current_number_clusters = len(self.__features);
120
121		while (current_number_clusters > self.__number_clusters):
122		indexes = self.__find_nearest_cluster_features();
123
124		self.__features[indexes[0]] += self.__features[indexes[1]];
125		self.__features.pop(indexes[1]);
126
127		current_number_clusters = len(self.__features);
128
129		# decode data
130		self.__decode_data();
131
132
133		def get_clusters(self):
134		"""!
135		@brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
136
137		@remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
138
139		@return (list) List of allocated clusters.
140
141		@see process()
142		@see get_noise()
143
144		"""
145
146		return self.__clusters;
147
148
149		def get_noise(self):
150		"""!
151		@brief Returns allocated noise.
152
153		@remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
154
155		@return (list) List of indexes that are marked as a noise.
156
157		@see process()
158		@see get_clusters()
159
160		"""
161
162		return self.__noise;
163
164
165		def get_cluster_encoding(self):
166		"""!
167		@brief Returns clustering result representation type that indicate how clusters are encoded.
168
169		@return (type_encoding) Clustering result representation.
170
171		@see get_clusters()
172
173		"""
174
175		return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
176
177
178		def __extract_features(self):
179		"""!
180		@brief Extracts features and outlier features from CF-tree cluster.
181
182		"""
183
184		self.__features = [];
185		self.__outlier_features = [];
186
187		if (len(self.__tree.leafes) == 1):
188		# parameters are too general, copy all entries
189		for entry in self.__tree.leafes[0].entries:
190		if (self.__outlier_detector < entry.number_points):
191		self.__features.append(entry);
192		else:
193		self.__outlier_features.append(entry);
194		else:
195		# copy all leaf clustering features
196		for node in self.__tree.leafes:
197		if (self.__outlier_detector < node.feature.number_points):
198		self.__features.append(node.feature);
199	View Code Duplication	else:
		0 ignored issues – show Duplication introduced 2016-04-28 20:11 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
200		self.__outlier_features.append(node.feature);
201
202
203		def __decode_data(self):
204		"""!
205		@brief Decodes data from CF-tree features.
206
207		"""
208
209		self.__clusters = [ [] for _ in range(self.__number_clusters) ];
210		self.__noise = [];
211
212		for index_point in range(0, len(self.__pointer_data)):
213		(cluster_distance, cluster_index) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__features);
214		(outlier_distance, _) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__outlier_features);
215
216		if (cluster_distance < outlier_distance):
217		self.__clusters[cluster_index].append(index_point);
218		else:
219		self.__noise.append(index_point);
220
221
222		def __insert_data(self):
223		"""!
224		@brief Inserts input data to the tree.
225
226		@remark If number of maximum number of entries is exceeded than diameter is increased and tree is rebuilt.
227
228		"""
229
230		for index_point in range(0, len(self.__pointer_data)):
231		point = self.__pointer_data[index_point];
232		self.__tree.insert_cluster( [ point ] );
233
234		if (self.__tree.amount_entries > self.__entry_size_limit):
235		self.__tree = self.__rebuild_tree(index_point);
236
237		#self.__tree.show_feature_destibution(self.__pointer_data);
238
239
240		def __rebuild_tree(self, index_point):
241		"""!
242		@brief Rebuilt tree in case of maxumum number of entries is exceeded.
243
244		@param[in] index_point (uint): Index of point that is used as end point of re-building.
245
246		@return (cftree) Rebuilt tree with encoded points till specified point from input data space.
247
248		"""
249
250		rebuild_result = False;
251		increased_diameter = self.__tree.threshold * self.__diameter_multiplier;
252
253		tree = None;
254
255		while(rebuild_result is False):
256		# increase diameter and rebuild tree
257		if (increased_diameter == 0.0):
258		increased_diameter = 1.0;
259
260		# build tree with update parameters
261		tree = cftree(self.__tree.branch_factor, self.__tree.max_entries, increased_diameter, self.__tree.type_measurement);
262
263		for index_point in range(0, index_point + 1):
264		point = self.__pointer_data[index_point];
265		tree.insert_cluster([point]);
266
267		if (tree.amount_entries > self.__entry_size_limit):
268		increased_diameter *= self.__diameter_multiplier;
269		continue;
270
271		# Re-build is successful.
272		rebuild_result = True;
273
274		return tree;
275
276
277		def __find_nearest_cluster_features(self):
278		"""!
279		@brief Find pair of nearest CF entries.
280
281		@return (list) List of two nearest enties that are represented by list [index_point1, index_point2].
282
283		"""
284
285		minimum_distance = float("Inf");
286		index1 = 0;
287		index2 = 0;
288
289		for index_candidate1 in range(0, len(self.__features)):
290		feature1 = self.__features[index_candidate1];
291		for index_candidate2 in range(index_candidate1 + 1, len(self.__features)):
292		feature2 = self.__features[index_candidate2];
293
294		distance = feature1.get_distance(feature2, self.__measurement_type);
295		if (distance < minimum_distance):
296		minimum_distance = distance;
297
298		index1 = index_candidate1;
299		index2 = index_candidate2;
300
301		return [index1, index2];
302
303
304		def __get_nearest_feature(self, point, feature_collection):
305		"""!
306		@brief Find nearest entry for specified point.
307
308		@param[in] point (list): Pointer to point from input dataset.
309		@param[in] feature_collection (list): Feature collection that is used for obtaining nearest feature for the specified point.
310
311		@return (double, uint) Tuple of distance to nearest entry to the specified point and index of that entry.
312
313		"""
314
315		minimum_distance = float("Inf");
316		index_nearest_feature = -1;
317
318		for index_entry in range(0, len(feature_collection)):
319		point_entry = cfentry(1, linear_sum([ point ]), square_sum([ point ]));
320
321		distance = feature_collection[index_entry].get_distance(point_entry, self.__measurement_type);
322		if (distance < minimum_distance):
323		minimum_distance = distance;
324		index_nearest_feature = index_entry;
325
326		return (minimum_distance, index_nearest_feature);
327

annoviko / pyclustering

Push — master ( 6ed467...f3f897 )

birch.get_cluster_encoding() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like