Completed
Push — master ( 6ed467...f3f897 )
by Andrei
01:29
created

birch.get_cluster_encoding()   A

Complexity

Conditions 1

Size

Total Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
dl 0
loc 11
rs 9.4285
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: BIRCH
4
@details Implementation based on article:
5
         - T.Zhang, R.Ramakrishnan, M.Livny. BIRCH: An Efficient Data Clustering Method for Very Large Databases. 1996.
6
         
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2016
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
from pyclustering.utils import linear_sum, square_sum;
30
31
from pyclustering.cluster.encoder import type_encoding;
32
33
from pyclustering.container.cftree import cftree, cfentry, measurement_type;
34
35
from copy import copy;
0 ignored issues
show
Unused Code introduced by
Unused copy imported from copy
Loading history...
36
37
38
class birch:
39
    """!
40
    @brief Class represents clustering algorithm BIRCH.
41
    
42
    Example:
43
    @code
44
        # sample for cluster analysis (represented by list)
45
        sample = read_sample(path_to_sample);
46
        
47
        # create object of birch that uses CCORE for processing
48
        birch_instance = birch(sample, 2, 5, 5, 0.05, measurement_type.CENTROID_EUCLIDIAN_DISTANCE, 200, True);
49
        
50
        # cluster analysis
51
        birch_instance.process();
52
        
53
        # obtain results of clustering
54
        clusters = birch_instance.get_clusters();
55
    @endcode
56
    
57
    """
58
    
59
    def __init__(self, data, number_clusters, branching_factor = 5, max_node_entries = 5, initial_diameter = 0.1, type_measurement = measurement_type.CENTROID_EUCLIDIAN_DISTANCE, entry_size_limit = 200, diameter_multiplier = 1.5, outlier_detector = 0, ccore = False):
60
        """!
61
        @brief Constructor of clustering algorithm BIRCH.
62
        
63
        @param[in] data (list): Input data presented as list of points (objects), where each point should be represented by list or tuple.
64
        @param[in] number_clusters (uint): Number of clusters that should be allocated.
65
        @param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree.
66
        @param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree.
67
        @param[in] initial_diameter (double): Initial diameter that used for CF-Tree construction, it can be increase if entry_size_limit is exceeded.
68
        @param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics.
69
        @param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded during creation then diameter is increased and CF-Tree is rebuilt.
70
        @param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when entry_size_limit is exceeded.
71
        @param[in] outlier_detector (uint): Minimum number of data points that should be contained by node to be considered as non-outlier node.
72
        @param[in] ccore (bool): If True than DLL CCORE (C++ solution) will be used for solving the problem.
73
        
74
        @remark Despite eight arguments only the first two is mandatory, others can be ommitted. In this case default values are used for instance creation.
75
        
76
        Example:
77
        @code
78
            birch_instance1 = birch(sample1, 2);    # two clusters should be allocated
79
            birch_instance2 = birch(sample2, 5);    # five clusters should be allocated
80
            
81
            # three clusters should be allocated, but also each leaf node can have maximum 5 
82
            # entries and each entry can have maximum 5 descriptors with initial diameter 0.05.
83
            birch_instance3 = birch(sample3, 3, 5, 5, 0.05);
84
        @endcode
85
        
86
        """
87
        
88
        self.__pointer_data = data;
89
        self.__number_clusters = number_clusters;
90
        
91
        self.__measurement_type = type_measurement;
92
        self.__entry_size_limit = entry_size_limit;
93
        self.__diameter_multiplier = diameter_multiplier;
94
        self.__outlier_detector = outlier_detector;
95
        self.__ccore = ccore;
96
        
97
        self.__features = None;
98
        self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement);
99
        
100
        self.__clusters = [];
101
        self.__noise = [];
102
103
104
    def process(self):
105
        """!
106
        @brief Performs cluster analysis in line with rules of BIRCH algorithm.
107
        
108
        @remark Results of clustering can be obtained using corresponding gets methods.
109
        
110
        @see get_clusters()
111
        @see get_noise()
112
        
113
        """
114
        
115
        self.__insert_data();
116
        self.__extract_features();
117
118
        # in line with specification modify hierarchical algorithm should be used for further clustering
119
        current_number_clusters = len(self.__features);
120
        
121
        while (current_number_clusters > self.__number_clusters):
122
            indexes = self.__find_nearest_cluster_features();
123
            
124
            self.__features[indexes[0]] += self.__features[indexes[1]];
125
            self.__features.pop(indexes[1]);
126
            
127
            current_number_clusters = len(self.__features);
128
            
129
        # decode data
130
        self.__decode_data();
131
    
132
    
133
    def get_clusters(self):
134
        """!
135
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
136
        
137
        @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
138
        
139
        @return (list) List of allocated clusters.
140
        
141
        @see process()
142
        @see get_noise()
143
        
144
        """
145
        
146
        return self.__clusters;
147
    
148
    
149
    def get_noise(self):
150
        """!
151
        @brief Returns allocated noise.
152
        
153
        @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
154
        
155
        @return (list) List of indexes that are marked as a noise.
156
        
157
        @see process()
158
        @see get_clusters()
159
        
160
        """
161
        
162
        return self.__noise;
163
164
165
    def get_cluster_encoding(self):
166
        """!
167
        @brief Returns clustering result representation type that indicate how clusters are encoded.
168
        
169
        @return (type_encoding) Clustering result representation.
170
        
171
        @see get_clusters()
172
        
173
        """
174
        
175
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
176
177
178
    def __extract_features(self):
179
        """!
180
        @brief Extracts features and outlier features from CF-tree cluster.
181
        
182
        """
183
        
184
        self.__features = [];
185
        self.__outlier_features = [];
186
        
187
        if (len(self.__tree.leafes) == 1):
188
            # parameters are too general, copy all entries
189
            for entry in self.__tree.leafes[0].entries:
190
                if (self.__outlier_detector < entry.number_points):
191
                    self.__features.append(entry);
192
                else:
193
                    self.__outlier_features.append(entry);
194
        else:
195
            # copy all leaf clustering features
196
            for node in self.__tree.leafes:
197
                if (self.__outlier_detector < node.feature.number_points):
198
                    self.__features.append(node.feature);
199 View Code Duplication
                else:
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
200
                    self.__outlier_features.append(node.feature);
201
    
202
    
203
    def __decode_data(self):
204
        """!
205
        @brief Decodes data from CF-tree features.
206
        
207
        """
208
        
209
        self.__clusters = [ [] for _ in range(self.__number_clusters) ];
210
        self.__noise = [];
211
        
212
        for index_point in range(0, len(self.__pointer_data)):
213
            (cluster_distance, cluster_index) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__features);
214
            (outlier_distance, _) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__outlier_features);
215
            
216
            if (cluster_distance < outlier_distance):
217
                self.__clusters[cluster_index].append(index_point);
218
            else:
219
                self.__noise.append(index_point);
220
    
221
    
222
    def __insert_data(self):
223
        """!
224
        @brief Inserts input data to the tree.
225
        
226
        @remark If number of maximum number of entries is exceeded than diameter is increased and tree is rebuilt.
227
        
228
        """
229
        
230
        for index_point in range(0, len(self.__pointer_data)):
231
            point = self.__pointer_data[index_point];
232
            self.__tree.insert_cluster( [ point ] );
233
            
234
            if (self.__tree.amount_entries > self.__entry_size_limit):
235
                self.__tree = self.__rebuild_tree(index_point);
236
        
237
        #self.__tree.show_feature_destibution(self.__pointer_data);
238
    
239
    
240
    def __rebuild_tree(self, index_point):
241
        """!
242
        @brief Rebuilt tree in case of maxumum number of entries is exceeded.
243
        
244
        @param[in] index_point (uint): Index of point that is used as end point of re-building.
245
        
246
        @return (cftree) Rebuilt tree with encoded points till specified point from input data space.
247
        
248
        """
249
        
250
        rebuild_result = False;
251
        increased_diameter = self.__tree.threshold * self.__diameter_multiplier;
252
        
253
        tree = None;
254
        
255
        while(rebuild_result is False):
256
            # increase diameter and rebuild tree
257
            if (increased_diameter == 0.0):
258
                increased_diameter = 1.0;
259
            
260
            # build tree with update parameters
261
            tree = cftree(self.__tree.branch_factor, self.__tree.max_entries, increased_diameter, self.__tree.type_measurement);
262
            
263
            for index_point in range(0, index_point + 1):
264
                point = self.__pointer_data[index_point];
265
                tree.insert_cluster([point]);
266
            
267
                if (tree.amount_entries > self.__entry_size_limit):
268
                    increased_diameter *= self.__diameter_multiplier;
269
                    continue;
270
            
271
            # Re-build is successful.
272
            rebuild_result = True;
273
        
274
        return tree;
275
    
276
    
277
    def __find_nearest_cluster_features(self):
278
        """!
279
        @brief Find pair of nearest CF entries.
280
        
281
        @return (list) List of two nearest enties that are represented by list [index_point1, index_point2].
282
        
283
        """
284
        
285
        minimum_distance = float("Inf");
286
        index1 = 0;
287
        index2 = 0;
288
        
289
        for index_candidate1 in range(0, len(self.__features)):
290
            feature1 = self.__features[index_candidate1];
291
            for index_candidate2 in range(index_candidate1 + 1, len(self.__features)):
292
                feature2 = self.__features[index_candidate2];
293
                
294
                distance = feature1.get_distance(feature2, self.__measurement_type);
295
                if (distance < minimum_distance):
296
                    minimum_distance = distance;
297
                    
298
                    index1 = index_candidate1;
299
                    index2 = index_candidate2;
300
        
301
        return [index1, index2];
302
    
303
    
304
    def __get_nearest_feature(self, point, feature_collection):
305
        """!
306
        @brief Find nearest entry for specified point.
307
        
308
        @param[in] point (list): Pointer to point from input dataset.
309
        @param[in] feature_collection (list): Feature collection that is used for obtaining nearest feature for the specified point.
310
        
311
        @return (double, uint) Tuple of distance to nearest entry to the specified point and index of that entry.
312
        
313
        """
314
        
315
        minimum_distance = float("Inf");
316
        index_nearest_feature = -1;
317
        
318
        for index_entry in range(0, len(feature_collection)):
319
            point_entry = cfentry(1, linear_sum([ point ]), square_sum([ point ]));
320
            
321
            distance = feature_collection[index_entry].get_distance(point_entry, self.__measurement_type);
322
            if (distance < minimum_distance):
323
                minimum_distance = distance;
324
                index_nearest_feature = index_entry;
325
                
326
        return (minimum_distance, index_nearest_feature);
327