birch.get_noise() - Code Metrics - Inspection of "[pyclustering.cluster.birch] Remove feature to ext..." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( ddfd8a...6ab844 )

by Andrei

created 2017-04-19 09:58 UTC

birch.get_noise() A

↳ Parent: birch

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
dl	0
loc	14
rs	9.4285
c	0
b	0
f	0

"""!

@brief Cluster analysis algorithm: BIRCH
@details Implementation based on article:
         - T.Zhang, R.Ramakrishnan, M.Livny. BIRCH: An Efficient Data Clustering Method for Very Large Databases. 1996.
         
@authors Andrei Novikov ([email protected])
@date 2014-2017
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


from pyclustering.utils import linear_sum, square_sum;

from pyclustering.cluster.encoder import type_encoding;

from pyclustering.container.cftree import cftree, cfentry, measurement_type;


class birch:
    """!
    @brief Class represents clustering algorithm BIRCH.
    
    Example:
    @code
        # sample for cluster analysis (represented by list)
        sample = read_sample(path_to_sample);
        
        # create object of birch that uses CCORE for processing
        birch_instance = birch(sample, 2, 5, 5, 0.05, measurement_type.CENTROID_EUCLIDIAN_DISTANCE, 200, True);
        
        # cluster analysis
        birch_instance.process();
        
        # obtain results of clustering
        clusters = birch_instance.get_clusters();
    @endcode
    
    """
    
    def __init__(self, data, number_clusters, branching_factor = 5, max_node_entries = 5, initial_diameter = 0.1, type_measurement = measurement_type.CENTROID_EUCLIDIAN_DISTANCE, entry_size_limit = 200, diameter_multiplier = 1.5, ccore = False):
        """!
        @brief Constructor of clustering algorithm BIRCH.
        
        @param[in] data (list): Input data presented as list of points (objects), where each point should be represented by list or tuple.
        @param[in] number_clusters (uint): Number of clusters that should be allocated.
        @param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree.
        @param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree.
        @param[in] initial_diameter (double): Initial diameter that used for CF-Tree construction, it can be increase if entry_size_limit is exceeded.
        @param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics.
        @param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded during creation then diameter is increased and CF-Tree is rebuilt.
        @param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when entry_size_limit is exceeded.
        @param[in] ccore (bool): If True than DLL CCORE (C++ solution) will be used for solving the problem.
        
        @remark Despite eight arguments only the first two is mandatory, others can be ommitted. In this case default values are used for instance creation.
        
        Example:
        @code
            birch_instance1 = birch(sample1, 2);    # two clusters should be allocated
            birch_instance2 = birch(sample2, 5);    # five clusters should be allocated
            
            # three clusters should be allocated, but also each leaf node can have maximum 5 
            # entries and each entry can have maximum 5 descriptors with initial diameter 0.05.
            birch_instance3 = birch(sample3, 3, 5, 5, 0.05);
        @endcode
        
        """
        
        self.__pointer_data = data;
        self.__number_clusters = number_clusters;
        
        self.__measurement_type = type_measurement;
        self.__entry_size_limit = entry_size_limit;
        self.__diameter_multiplier = diameter_multiplier;
        self.__ccore = ccore;
        
        self.__features = None;
        self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement);
        
        self.__clusters = [];
        self.__noise = [];


    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of BIRCH algorithm.
        
        @remark Results of clustering can be obtained using corresponding gets methods.
        
        @see get_clusters()
        
        """
        
        self.__insert_data();
        self.__extract_features();

        # in line with specification modify hierarchical algorithm should be used for further clustering
        current_number_clusters = len(self.__features);
        
        while (current_number_clusters > self.__number_clusters):
            indexes = self.__find_nearest_cluster_features();
            
            self.__features[indexes[0]] += self.__features[indexes[1]];
            self.__features.pop(indexes[1]);
            
            current_number_clusters = len(self.__features);
            
        # decode data
        self.__decode_data();
    
    
    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
        
        @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
        
        @return (list) List of allocated clusters.
        
        @see process()
        @see get_noise()
        
        """
        
        return self.__clusters;


    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicate how clusters are encoded.
        
        @return (type_encoding) Clustering result representation.
        
        @see get_clusters()
        
        """
        
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;


    def __extract_features(self):
        """!
        @brief Extracts features from CF-tree cluster.
        
        """
        
        self.__features = [];
        
        if (len(self.__tree.leafes) == 1):
            # parameters are too general, copy all entries
            for entry in self.__tree.leafes[0].entries:
                self.__features.append(entry);

        else:
            # copy all leaf clustering features
            for node in self.__tree.leafes:
                self.__features.append(node.feature);
    
    
    def __decode_data(self):
        """!
        @brief Decodes data from CF-tree features.
        
        """
        
        self.__clusters = [ [] for _ in range(self.__number_clusters) ];
        self.__noise = [];
        
        for index_point in range(0, len(self.__pointer_data)):
            (_, cluster_index) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__features);
            
            self.__clusters[cluster_index].append(index_point);
    
    
    def __insert_data(self):
        """!
        @brief Inserts input data to the tree.
        
        @remark If number of maximum number of entries is exceeded than diameter is increased and tree is rebuilt.
        
        """
        
        for index_point in range(0, len(self.__pointer_data)):

            point = self.__pointer_data[index_point];
            self.__tree.insert_cluster( [ point ] );
            
            if (self.__tree.amount_entries > self.__entry_size_limit):
                self.__tree = self.__rebuild_tree(index_point);
        
        #self.__tree.show_feature_destibution(self.__pointer_data);
    
    
    def __rebuild_tree(self, index_point):
        """!
        @brief Rebuilt tree in case of maxumum number of entries is exceeded.
        
        @param[in] index_point (uint): Index of point that is used as end point of re-building.
        
        @return (cftree) Rebuilt tree with encoded points till specified point from input data space.
        
        """
        
        rebuild_result = False;
        increased_diameter = self.__tree.threshold * self.__diameter_multiplier;
        
        tree = None;
        
        while(rebuild_result is False):
            # increase diameter and rebuild tree
            if (increased_diameter == 0.0):
                increased_diameter = 1.0;
            
            # build tree with update parameters
            tree = cftree(self.__tree.branch_factor, self.__tree.max_entries, increased_diameter, self.__tree.type_measurement);
            
            for index_point in range(0, index_point + 1):
                point = self.__pointer_data[index_point];
                tree.insert_cluster([point]);
            
                if (tree.amount_entries > self.__entry_size_limit):
                    increased_diameter *= self.__diameter_multiplier;
                    continue;
            
            # Re-build is successful.
            rebuild_result = True;
        
        return tree;
    
    
    def __find_nearest_cluster_features(self):
        """!
        @brief Find pair of nearest CF entries.
        
        @return (list) List of two nearest enties that are represented by list [index_point1, index_point2].
        
        """
        
        minimum_distance = float("Inf");
        index1 = 0;
        index2 = 0;
        
        for index_candidate1 in range(0, len(self.__features)):
            feature1 = self.__features[index_candidate1];
            for index_candidate2 in range(index_candidate1 + 1, len(self.__features)):
                feature2 = self.__features[index_candidate2];
                
                distance = feature1.get_distance(feature2, self.__measurement_type);
                if (distance < minimum_distance):
                    minimum_distance = distance;
                    
                    index1 = index_candidate1;
                    index2 = index_candidate2;
        
        return [index1, index2];
    
    
    def __get_nearest_feature(self, point, feature_collection):
        """!
        @brief Find nearest entry for specified point.
        
        @param[in] point (list): Pointer to point from input dataset.
        @param[in] feature_collection (list): Feature collection that is used for obtaining nearest feature for the specified point.
        
        @return (double, uint) Tuple of distance to nearest entry to the specified point and index of that entry.
        
        """
        
        minimum_distance = float("Inf");
        index_nearest_feature = -1;
        
        for index_entry in range(0, len(feature_collection)):
            point_entry = cfentry(1, linear_sum([ point ]), square_sum([ point ]));
            
            distance = feature_collection[index_entry].get_distance(point_entry, self.__measurement_type);
            if (distance < minimum_distance):
                minimum_distance = distance;
                index_nearest_feature = index_entry;
                
        return (minimum_distance, index_nearest_feature);


1		"""!
2
3		@brief Cluster analysis algorithm: BIRCH
4		@details Implementation based on article:
5		- T.Zhang, R.Ramakrishnan, M.Livny. BIRCH: An Efficient Data Clustering Method for Very Large Databases. 1996.
6
7		@authors Andrei Novikov ([email protected])
8		@date 2014-2017
9		@copyright GNU Public License
10
11		@cond GNU_PUBLIC_LICENSE
12		PyClustering is free software: you can redistribute it and/or modify
13		it under the terms of the GNU General Public License as published by
14		the Free Software Foundation, either version 3 of the License, or
15		(at your option) any later version.
16
17		PyClustering is distributed in the hope that it will be useful,
18		but WITHOUT ANY WARRANTY; without even the implied warranty of
19		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20		GNU General Public License for more details.
21
22		You should have received a copy of the GNU General Public License
23		along with this program. If not, see <http://www.gnu.org/licenses/>.
24		@endcond
25
26		"""
27
28
29		from pyclustering.utils import linear_sum, square_sum;
30
31		from pyclustering.cluster.encoder import type_encoding;
32
33		from pyclustering.container.cftree import cftree, cfentry, measurement_type;
34
35
36		class birch:
37		"""!
38		@brief Class represents clustering algorithm BIRCH.
39
40		Example:
41		@code
42		# sample for cluster analysis (represented by list)
43		sample = read_sample(path_to_sample);
44
45		# create object of birch that uses CCORE for processing
46		birch_instance = birch(sample, 2, 5, 5, 0.05, measurement_type.CENTROID_EUCLIDIAN_DISTANCE, 200, True);
47
48		# cluster analysis
49		birch_instance.process();
50
51		# obtain results of clustering
52		clusters = birch_instance.get_clusters();
53		@endcode
54
55		"""
56
57		def __init__(self, data, number_clusters, branching_factor = 5, max_node_entries = 5, initial_diameter = 0.1, type_measurement = measurement_type.CENTROID_EUCLIDIAN_DISTANCE, entry_size_limit = 200, diameter_multiplier = 1.5, ccore = False):
58		"""!
59		@brief Constructor of clustering algorithm BIRCH.
60
61		@param[in] data (list): Input data presented as list of points (objects), where each point should be represented by list or tuple.
62		@param[in] number_clusters (uint): Number of clusters that should be allocated.
63		@param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree.
64		@param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree.
65		@param[in] initial_diameter (double): Initial diameter that used for CF-Tree construction, it can be increase if entry_size_limit is exceeded.
66		@param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics.
67		@param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded during creation then diameter is increased and CF-Tree is rebuilt.
68		@param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when entry_size_limit is exceeded.
69		@param[in] ccore (bool): If True than DLL CCORE (C++ solution) will be used for solving the problem.
70
71		@remark Despite eight arguments only the first two is mandatory, others can be ommitted. In this case default values are used for instance creation.
72
73		Example:
74		@code
75		birch_instance1 = birch(sample1, 2); # two clusters should be allocated
76		birch_instance2 = birch(sample2, 5); # five clusters should be allocated
77
78		# three clusters should be allocated, but also each leaf node can have maximum 5
79		# entries and each entry can have maximum 5 descriptors with initial diameter 0.05.
80		birch_instance3 = birch(sample3, 3, 5, 5, 0.05);
81		@endcode
82
83		"""
84
85		self.__pointer_data = data;
86		self.__number_clusters = number_clusters;
87
88		self.__measurement_type = type_measurement;
89		self.__entry_size_limit = entry_size_limit;
90		self.__diameter_multiplier = diameter_multiplier;
91		self.__ccore = ccore;
92
93		self.__features = None;
94		self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement);
95
96		self.__clusters = [];
97		self.__noise = [];
98
99
100		def process(self):
101		"""!
102		@brief Performs cluster analysis in line with rules of BIRCH algorithm.
103
104		@remark Results of clustering can be obtained using corresponding gets methods.
105
106		@see get_clusters()
107
108		"""
109
110		self.__insert_data();
111		self.__extract_features();
112
113		# in line with specification modify hierarchical algorithm should be used for further clustering
114		current_number_clusters = len(self.__features);
115
116		while (current_number_clusters > self.__number_clusters):
117		indexes = self.__find_nearest_cluster_features();
118
119		self.__features[indexes[0]] += self.__features[indexes[1]];
120		self.__features.pop(indexes[1]);
121
122		current_number_clusters = len(self.__features);
123
124		# decode data
125		self.__decode_data();
126
127
128		def get_clusters(self):
129		"""!
130		@brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
131
132		@remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
133
134		@return (list) List of allocated clusters.
135
136		@see process()
137		@see get_noise()
138
139		"""
140
141		return self.__clusters;
142
143
144		def get_cluster_encoding(self):
145		"""!
146		@brief Returns clustering result representation type that indicate how clusters are encoded.
147
148		@return (type_encoding) Clustering result representation.
149
150		@see get_clusters()
151
152		"""
153
154		return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
155
156
157		def __extract_features(self):
158		"""!
159		@brief Extracts features from CF-tree cluster.
160
161		"""
162
163		self.__features = [];
164
165		if (len(self.__tree.leafes) == 1):
166		# parameters are too general, copy all entries
167		for entry in self.__tree.leafes[0].entries:
168		self.__features.append(entry);
169
170		else:
171		# copy all leaf clustering features
172		for node in self.__tree.leafes:
173		self.__features.append(node.feature);
174
175
176		def __decode_data(self):
177		"""!
178		@brief Decodes data from CF-tree features.
179
180		"""
181
182		self.__clusters = [ [] for _ in range(self.__number_clusters) ];
183		self.__noise = [];
184
185		for index_point in range(0, len(self.__pointer_data)):
186		(_, cluster_index) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__features);
187
188		self.__clusters[cluster_index].append(index_point);
189
190
191		def __insert_data(self):
192		"""!
193		@brief Inserts input data to the tree.
194
195		@remark If number of maximum number of entries is exceeded than diameter is increased and tree is rebuilt.
196
197		"""
198
199	View Code Duplication	for index_point in range(0, len(self.__pointer_data)):
		0 ignored issues – show Duplication introduced 2017-04-19 10:01 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
200		point = self.__pointer_data[index_point];
201		self.__tree.insert_cluster( [ point ] );
202
203		if (self.__tree.amount_entries > self.__entry_size_limit):
204		self.__tree = self.__rebuild_tree(index_point);
205
206		#self.__tree.show_feature_destibution(self.__pointer_data);
207
208
209		def __rebuild_tree(self, index_point):
210		"""!
211		@brief Rebuilt tree in case of maxumum number of entries is exceeded.
212
213		@param[in] index_point (uint): Index of point that is used as end point of re-building.
214
215		@return (cftree) Rebuilt tree with encoded points till specified point from input data space.
216
217		"""
218
219		rebuild_result = False;
220		increased_diameter = self.__tree.threshold * self.__diameter_multiplier;
221
222		tree = None;
223
224		while(rebuild_result is False):
225		# increase diameter and rebuild tree
226		if (increased_diameter == 0.0):
227		increased_diameter = 1.0;
228
229		# build tree with update parameters
230		tree = cftree(self.__tree.branch_factor, self.__tree.max_entries, increased_diameter, self.__tree.type_measurement);
231
232		for index_point in range(0, index_point + 1):
233		point = self.__pointer_data[index_point];
234		tree.insert_cluster([point]);
235
236		if (tree.amount_entries > self.__entry_size_limit):
237		increased_diameter *= self.__diameter_multiplier;
238		continue;
239
240		# Re-build is successful.
241		rebuild_result = True;
242
243		return tree;
244
245
246		def __find_nearest_cluster_features(self):
247		"""!
248		@brief Find pair of nearest CF entries.
249
250		@return (list) List of two nearest enties that are represented by list [index_point1, index_point2].
251
252		"""
253
254		minimum_distance = float("Inf");
255		index1 = 0;
256		index2 = 0;
257
258		for index_candidate1 in range(0, len(self.__features)):
259		feature1 = self.__features[index_candidate1];
260		for index_candidate2 in range(index_candidate1 + 1, len(self.__features)):
261		feature2 = self.__features[index_candidate2];
262
263		distance = feature1.get_distance(feature2, self.__measurement_type);
264		if (distance < minimum_distance):
265		minimum_distance = distance;
266
267		index1 = index_candidate1;
268		index2 = index_candidate2;
269
270		return [index1, index2];
271
272
273		def __get_nearest_feature(self, point, feature_collection):
274		"""!
275		@brief Find nearest entry for specified point.
276
277		@param[in] point (list): Pointer to point from input dataset.
278		@param[in] feature_collection (list): Feature collection that is used for obtaining nearest feature for the specified point.
279
280		@return (double, uint) Tuple of distance to nearest entry to the specified point and index of that entry.
281
282		"""
283
284		minimum_distance = float("Inf");
285		index_nearest_feature = -1;
286
287		for index_entry in range(0, len(feature_collection)):
288		point_entry = cfentry(1, linear_sum([ point ]), square_sum([ point ]));
289
290		distance = feature_collection[index_entry].get_distance(point_entry, self.__measurement_type);
291		if (distance < minimum_distance):
292		minimum_distance = distance;
293		index_nearest_feature = index_entry;
294
295		return (minimum_distance, index_nearest_feature);
296

annoviko / pyclustering

Push — master ( ddfd8a...6ab844 )

birch.get_noise() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like