birch.__extract_features() - Code Metrics - Inspection of "[pyclustering.container.cftree] Bug correction (is..." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( b2f733...eb13df )

by Andrei

created 2016-06-08 16:08 UTC

birch.__extract_features() B

↳ Parent: birch

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	6
dl	0
loc	23
rs	7.6949
c	0
b	0
f	0

"""!

@brief Cluster analysis algorithm: BIRCH
@details Implementation based on article:
         - T.Zhang, R.Ramakrishnan, M.Livny. BIRCH: An Efficient Data Clustering Method for Very Large Databases. 1996.
         
@authors Andrei Novikov ([email protected])
@date 2014-2016
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""

from pyclustering.utils import linear_sum, square_sum;

from pyclustering.container.cftree import cftree, cfentry, measurement_type;

from copy import copy;


class birch:
    """!
    @brief Class represents clustering algorithm BIRCH.
    
    Example:
    @code
        # sample for cluster analysis (represented by list)
        sample = read_sample(path_to_sample);
        
        # create object of birch that uses CCORE for processing
        birch_instance = birch(sample, 2, 5, 5, 0.05, measurement_type.CENTROID_EUCLIDIAN_DISTANCE, 200, True);
        
        # cluster analysis
        birch_instance.process();
        
        # obtain results of clustering
        clusters = birch_instance.get_clusters();
    @endcode
    
    """
    
    def __init__(self, data, number_clusters, branching_factor = 5, max_node_entries = 5, initial_diameter = 0.1, type_measurement = measurement_type.CENTROID_EUCLIDIAN_DISTANCE, entry_size_limit = 200, diameter_multiplier = 1.5, outlier_detector = 0, ccore = False):
        """!
        @brief Constructor of clustering algorithm BIRCH.
        
        @param[in] data (list): Input data presented as list of points (objects), where each point should be represented by list or tuple.
        @param[in] number_clusters (uint): Number of clusters that should be allocated.
        @param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree.
        @param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree.
        @param[in] initial_diameter (double): Initial diameter that used for CF-Tree construction, it can be increase if entry_size_limit is exceeded.
        @param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics.
        @param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded during creation then diameter is increased and CF-Tree is rebuilt.
        @param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when entry_size_limit is exceeded.
        @param[in] outlier_detector (uint): Minimum number of data points that should be contained by node to be considered as non-outlier node.
        @param[in] ccore (bool): If True than DLL CCORE (C++ solution) will be used for solving the problem.
        
        @remark Despite eight arguments only the first two is mandatory, others can be ommitted. In this case default values are used for instance creation.
        
        Example:
        @code
            birch_instance1 = birch(sample1, 2);    # two clusters should be allocated
            birch_instance2 = birch(sample2, 5);    # five clusters should be allocated
            
            # three clusters should be allocated, but also each leaf node can have maximum 5 
            # entries and each entry can have maximum 5 descriptors with initial diameter 0.05.
            birch_instance3 = birch(sample3, 3, 5, 5, 0.05);
        @endcode
        
        """
        
        self.__pointer_data = data;
        self.__number_clusters = number_clusters;
        
        self.__measurement_type = type_measurement;
        self.__entry_size_limit = entry_size_limit;
        self.__diameter_multiplier = diameter_multiplier;
        self.__outlier_detector = outlier_detector;
        self.__ccore = ccore;
        
        self.__features = None;
        self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement);
        
        self.__clusters = [];
        self.__noise = [];


    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of BIRCH algorithm.
        
        @remark Results of clustering can be obtained using corresponding gets methods.
        
        @see get_clusters()
        @see get_noise()
        
        """
        
        self.__insert_data();
        self.__extract_features();

        # in line with specification modify hierarchical algorithm should be used for further clustering
        current_number_clusters = len(self.__features);
        
        while (current_number_clusters > self.__number_clusters):
            indexes = self.__find_nearest_cluster_features();
            
            self.__features[indexes[0]] += self.__features[indexes[1]];
            self.__features.pop(indexes[1]);
            
            current_number_clusters = len(self.__features);
            
        # decode data
        self.__decode_data();
    
    
    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
        
        @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
        
        @return (list) List of allocated clusters.
        
        @see process()
        @see get_noise()
        
        """
        
        return self.__clusters;
    
    
    def get_noise(self):
        """!
        @brief Returns allocated noise.
        
        @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
        
        @return (list) List of indexes that are marked as a noise.
        
        @see process()
        @see get_clusters()
        
        """
        
        return self.__noise;
    
    
    def __extract_features(self):
        """!
        @brief Extracts features and outlier features from CF-tree cluster.
        
        """
        
        self.__features = [];
        self.__outlier_features = [];
        
        if (len(self.__tree.leafes) == 1):
            # parameters are too general, copy all entries
            for entry in self.__tree.leafes[0].entries:
                if (self.__outlier_detector < entry.number_points):
                    self.__features.append(entry);
                else:
                    self.__outlier_features.append(entry);
        else:
            # copy all leaf clustering features
            for node in self.__tree.leafes:
                if (self.__outlier_detector < node.feature.number_points):
                    self.__features.append(node.feature);
                else:
                    self.__outlier_features.append(node.feature);
    
    
    def __decode_data(self):
        """!
        @brief Decodes data from CF-tree features.
        
        """
        
        self.__clusters = [ [] for _ in range(self.__number_clusters) ];
        self.__noise = [];
        
        for index_point in range(0, len(self.__pointer_data)):
            (cluster_distance, cluster_index) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__features);
            (outlier_distance, _) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__outlier_features);
            
            if (cluster_distance < outlier_distance):

                self.__clusters[cluster_index].append(index_point);
            else:
                self.__noise.append(index_point);
    
    
    def __insert_data(self):
        """!
        @brief Inserts input data to the tree.
        
        @remark If number of maximum number of entries is exceeded than diameter is increased and tree is rebuilt.
        
        """
        
        for index_point in range(0, len(self.__pointer_data)):
            point = self.__pointer_data[index_point];
            self.__tree.insert_cluster( [ point ] );
            
            if (self.__tree.amount_entries > self.__entry_size_limit):
                self.__tree = self.__rebuild_tree(index_point);
        
        #self.__tree.show_feature_destibution(self.__pointer_data);
    
    
    def __rebuild_tree(self, index_point):
        """!
        @brief Rebuilt tree in case of maxumum number of entries is exceeded.
        
        @param[in] index_point (uint): Index of point that is used as end point of re-building.
        
        @return (cftree) Rebuilt tree with encoded points till specified point from input data space.
        
        """
        
        rebuild_result = False;
        increased_diameter = self.__tree.threshold * self.__diameter_multiplier;
        
        tree = None;
        
        while(rebuild_result is False):
            # increase diameter and rebuild tree
            if (increased_diameter == 0.0):
                increased_diameter = 1.0;
            
            # build tree with update parameters
            tree = cftree(self.__tree.branch_factor, self.__tree.max_entries, increased_diameter, self.__tree.type_measurement);
            
            for index_point in range(0, index_point + 1):
                point = self.__pointer_data[index_point];
                tree.insert_cluster([point]);
            
                if (tree.amount_entries > self.__entry_size_limit):
                    increased_diameter *= self.__diameter_multiplier;
                    continue;
            
            # Re-build is successful.
            rebuild_result = True;
        
        return tree;
    
    
    def __find_nearest_cluster_features(self):
        """!
        @brief Find pair of nearest CF entries.
        
        @return (list) List of two nearest enties that are represented by list [index_point1, index_point2].
        
        """
        
        minimum_distance = float("Inf");
        index1 = 0;
        index2 = 0;
        
        for index_candidate1 in range(0, len(self.__features)):
            feature1 = self.__features[index_candidate1];
            for index_candidate2 in range(index_candidate1 + 1, len(self.__features)):
                feature2 = self.__features[index_candidate2];
                
                distance = feature1.get_distance(feature2, self.__measurement_type);
                if (distance < minimum_distance):
                    minimum_distance = distance;
                    
                    index1 = index_candidate1;
                    index2 = index_candidate2;
        
        return [index1, index2];
    
    
    def __get_nearest_feature(self, point, feature_collection):
        """!
        @brief Find nearest entry for specified point.
        
        @param[in] point (list): Pointer to point from input dataset.
        @param[in] feature_collection (list): Feature collection that is used for obtaining nearest feature for the specified point.
        
        @return (double, uint) Tuple of distance to nearest entry to the specified point and index of that entry.
        
        """
        
        minimum_distance = float("Inf");
        index_nearest_feature = -1;
        
        for index_entry in range(0, len(feature_collection)):
            point_entry = cfentry(1, linear_sum([ point ]), square_sum([ point ]));
            
            distance = feature_collection[index_entry].get_distance(point_entry, self.__measurement_type);
            if (distance < minimum_distance):
                minimum_distance = distance;
                index_nearest_feature = index_entry;
                
        return (minimum_distance, index_nearest_feature);


1		"""!
2
3		@brief Cluster analysis algorithm: BIRCH
4		@details Implementation based on article:
5		- T.Zhang, R.Ramakrishnan, M.Livny. BIRCH: An Efficient Data Clustering Method for Very Large Databases. 1996.
6
7		@authors Andrei Novikov ([email protected])
8		@date 2014-2016
9		@copyright GNU Public License
10
11		@cond GNU_PUBLIC_LICENSE
12		PyClustering is free software: you can redistribute it and/or modify
13		it under the terms of the GNU General Public License as published by
14		the Free Software Foundation, either version 3 of the License, or
15		(at your option) any later version.
16
17		PyClustering is distributed in the hope that it will be useful,
18		but WITHOUT ANY WARRANTY; without even the implied warranty of
19		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20		GNU General Public License for more details.
21
22		You should have received a copy of the GNU General Public License
23		along with this program. If not, see <http://www.gnu.org/licenses/>.
24		@endcond
25
26		"""
27
28		from pyclustering.utils import linear_sum, square_sum;
29
30		from pyclustering.container.cftree import cftree, cfentry, measurement_type;
31
32		from copy import copy;
		0 ignored issues – show Unused Code introduced 2016-06-08 16:10 UTC by Report Bug Copy Issue Report Unused copy imported from copy Loading history...
33
34		class birch:
35		"""!
36		@brief Class represents clustering algorithm BIRCH.
37
38		Example:
39		@code
40		# sample for cluster analysis (represented by list)
41		sample = read_sample(path_to_sample);
42
43		# create object of birch that uses CCORE for processing
44		birch_instance = birch(sample, 2, 5, 5, 0.05, measurement_type.CENTROID_EUCLIDIAN_DISTANCE, 200, True);
45
46		# cluster analysis
47		birch_instance.process();
48
49		# obtain results of clustering
50		clusters = birch_instance.get_clusters();
51		@endcode
52
53		"""
54
55		def __init__(self, data, number_clusters, branching_factor = 5, max_node_entries = 5, initial_diameter = 0.1, type_measurement = measurement_type.CENTROID_EUCLIDIAN_DISTANCE, entry_size_limit = 200, diameter_multiplier = 1.5, outlier_detector = 0, ccore = False):
56		"""!
57		@brief Constructor of clustering algorithm BIRCH.
58
59		@param[in] data (list): Input data presented as list of points (objects), where each point should be represented by list or tuple.
60		@param[in] number_clusters (uint): Number of clusters that should be allocated.
61		@param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree.
62		@param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree.
63		@param[in] initial_diameter (double): Initial diameter that used for CF-Tree construction, it can be increase if entry_size_limit is exceeded.
64		@param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics.
65		@param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded during creation then diameter is increased and CF-Tree is rebuilt.
66		@param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when entry_size_limit is exceeded.
67		@param[in] outlier_detector (uint): Minimum number of data points that should be contained by node to be considered as non-outlier node.
68		@param[in] ccore (bool): If True than DLL CCORE (C++ solution) will be used for solving the problem.
69
70		@remark Despite eight arguments only the first two is mandatory, others can be ommitted. In this case default values are used for instance creation.
71
72		Example:
73		@code
74		birch_instance1 = birch(sample1, 2); # two clusters should be allocated
75		birch_instance2 = birch(sample2, 5); # five clusters should be allocated
76
77		# three clusters should be allocated, but also each leaf node can have maximum 5
78		# entries and each entry can have maximum 5 descriptors with initial diameter 0.05.
79		birch_instance3 = birch(sample3, 3, 5, 5, 0.05);
80		@endcode
81
82		"""
83
84		self.__pointer_data = data;
85		self.__number_clusters = number_clusters;
86
87		self.__measurement_type = type_measurement;
88		self.__entry_size_limit = entry_size_limit;
89		self.__diameter_multiplier = diameter_multiplier;
90		self.__outlier_detector = outlier_detector;
91		self.__ccore = ccore;
92
93		self.__features = None;
94		self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement);
95
96		self.__clusters = [];
97		self.__noise = [];
98
99
100		def process(self):
101		"""!
102		@brief Performs cluster analysis in line with rules of BIRCH algorithm.
103
104		@remark Results of clustering can be obtained using corresponding gets methods.
105
106		@see get_clusters()
107		@see get_noise()
108
109		"""
110
111		self.__insert_data();
112		self.__extract_features();
113
114		# in line with specification modify hierarchical algorithm should be used for further clustering
115		current_number_clusters = len(self.__features);
116
117		while (current_number_clusters > self.__number_clusters):
118		indexes = self.__find_nearest_cluster_features();
119
120		self.__features[indexes[0]] += self.__features[indexes[1]];
121		self.__features.pop(indexes[1]);
122
123		current_number_clusters = len(self.__features);
124
125		# decode data
126		self.__decode_data();
127
128
129		def get_clusters(self):
130		"""!
131		@brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
132
133		@remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
134
135		@return (list) List of allocated clusters.
136
137		@see process()
138		@see get_noise()
139
140		"""
141
142		return self.__clusters;
143
144
145		def get_noise(self):
146		"""!
147		@brief Returns allocated noise.
148
149		@remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
150
151		@return (list) List of indexes that are marked as a noise.
152
153		@see process()
154		@see get_clusters()
155
156		"""
157
158		return self.__noise;
159
160
161		def __extract_features(self):
162		"""!
163		@brief Extracts features and outlier features from CF-tree cluster.
164
165		"""
166
167		self.__features = [];
168		self.__outlier_features = [];
169
170		if (len(self.__tree.leafes) == 1):
171		# parameters are too general, copy all entries
172		for entry in self.__tree.leafes[0].entries:
173		if (self.__outlier_detector < entry.number_points):
174		self.__features.append(entry);
175		else:
176		self.__outlier_features.append(entry);
177		else:
178		# copy all leaf clustering features
179		for node in self.__tree.leafes:
180		if (self.__outlier_detector < node.feature.number_points):
181		self.__features.append(node.feature);
182		else:
183		self.__outlier_features.append(node.feature);
184
185
186		def __decode_data(self):
187		"""!
188		@brief Decodes data from CF-tree features.
189
190		"""
191
192		self.__clusters = [ [] for _ in range(self.__number_clusters) ];
193		self.__noise = [];
194
195		for index_point in range(0, len(self.__pointer_data)):
196		(cluster_distance, cluster_index) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__features);
197		(outlier_distance, _) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__outlier_features);
198
199	View Code Duplication	if (cluster_distance < outlier_distance):
		0 ignored issues – show Duplication introduced 2016-04-28 20:11 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
200		self.__clusters[cluster_index].append(index_point);
201		else:
202		self.__noise.append(index_point);
203
204
205		def __insert_data(self):
206		"""!
207		@brief Inserts input data to the tree.
208
209		@remark If number of maximum number of entries is exceeded than diameter is increased and tree is rebuilt.
210
211		"""
212
213		for index_point in range(0, len(self.__pointer_data)):
214		point = self.__pointer_data[index_point];
215		self.__tree.insert_cluster( [ point ] );
216
217		if (self.__tree.amount_entries > self.__entry_size_limit):
218		self.__tree = self.__rebuild_tree(index_point);
219
220		#self.__tree.show_feature_destibution(self.__pointer_data);
221
222
223		def __rebuild_tree(self, index_point):
224		"""!
225		@brief Rebuilt tree in case of maxumum number of entries is exceeded.
226
227		@param[in] index_point (uint): Index of point that is used as end point of re-building.
228
229		@return (cftree) Rebuilt tree with encoded points till specified point from input data space.
230
231		"""
232
233		rebuild_result = False;
234		increased_diameter = self.__tree.threshold * self.__diameter_multiplier;
235
236		tree = None;
237
238		while(rebuild_result is False):
239		# increase diameter and rebuild tree
240		if (increased_diameter == 0.0):
241		increased_diameter = 1.0;
242
243		# build tree with update parameters
244		tree = cftree(self.__tree.branch_factor, self.__tree.max_entries, increased_diameter, self.__tree.type_measurement);
245
246		for index_point in range(0, index_point + 1):
247		point = self.__pointer_data[index_point];
248		tree.insert_cluster([point]);
249
250		if (tree.amount_entries > self.__entry_size_limit):
251		increased_diameter *= self.__diameter_multiplier;
252		continue;
253
254		# Re-build is successful.
255		rebuild_result = True;
256
257		return tree;
258
259
260		def __find_nearest_cluster_features(self):
261		"""!
262		@brief Find pair of nearest CF entries.
263
264		@return (list) List of two nearest enties that are represented by list [index_point1, index_point2].
265
266		"""
267
268		minimum_distance = float("Inf");
269		index1 = 0;
270		index2 = 0;
271
272		for index_candidate1 in range(0, len(self.__features)):
273		feature1 = self.__features[index_candidate1];
274		for index_candidate2 in range(index_candidate1 + 1, len(self.__features)):
275		feature2 = self.__features[index_candidate2];
276
277		distance = feature1.get_distance(feature2, self.__measurement_type);
278		if (distance < minimum_distance):
279		minimum_distance = distance;
280
281		index1 = index_candidate1;
282		index2 = index_candidate2;
283
284		return [index1, index2];
285
286
287		def __get_nearest_feature(self, point, feature_collection):
288		"""!
289		@brief Find nearest entry for specified point.
290
291		@param[in] point (list): Pointer to point from input dataset.
292		@param[in] feature_collection (list): Feature collection that is used for obtaining nearest feature for the specified point.
293
294		@return (double, uint) Tuple of distance to nearest entry to the specified point and index of that entry.
295
296		"""
297
298		minimum_distance = float("Inf");
299		index_nearest_feature = -1;
300
301		for index_entry in range(0, len(feature_collection)):
302		point_entry = cfentry(1, linear_sum([ point ]), square_sum([ point ]));
303
304		distance = feature_collection[index_entry].get_distance(point_entry, self.__measurement_type);
305		if (distance < minimum_distance):
306		minimum_distance = distance;
307		index_nearest_feature = index_entry;
308
309		return (minimum_distance, index_nearest_feature);
310

annoviko / pyclustering

Push — master ( b2f733...eb13df )

birch.__extract_features() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like