Completed
Push — master ( b2f733...eb13df )
by Andrei
02:02
created

birch.__extract_features()   B

Complexity

Conditions 6

Size

Total Lines 23

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 6
dl 0
loc 23
rs 7.6949
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: BIRCH
4
@details Implementation based on article:
5
         - T.Zhang, R.Ramakrishnan, M.Livny. BIRCH: An Efficient Data Clustering Method for Very Large Databases. 1996.
6
         
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2016
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
from pyclustering.utils import linear_sum, square_sum;
29
30
from pyclustering.container.cftree import cftree, cfentry, measurement_type;
31
32
from copy import copy;
0 ignored issues
show
Unused Code introduced by
Unused copy imported from copy
Loading history...
33
34
class birch:
35
    """!
36
    @brief Class represents clustering algorithm BIRCH.
37
    
38
    Example:
39
    @code
40
        # sample for cluster analysis (represented by list)
41
        sample = read_sample(path_to_sample);
42
        
43
        # create object of birch that uses CCORE for processing
44
        birch_instance = birch(sample, 2, 5, 5, 0.05, measurement_type.CENTROID_EUCLIDIAN_DISTANCE, 200, True);
45
        
46
        # cluster analysis
47
        birch_instance.process();
48
        
49
        # obtain results of clustering
50
        clusters = birch_instance.get_clusters();
51
    @endcode
52
    
53
    """
54
    
55
    def __init__(self, data, number_clusters, branching_factor = 5, max_node_entries = 5, initial_diameter = 0.1, type_measurement = measurement_type.CENTROID_EUCLIDIAN_DISTANCE, entry_size_limit = 200, diameter_multiplier = 1.5, outlier_detector = 0, ccore = False):
56
        """!
57
        @brief Constructor of clustering algorithm BIRCH.
58
        
59
        @param[in] data (list): Input data presented as list of points (objects), where each point should be represented by list or tuple.
60
        @param[in] number_clusters (uint): Number of clusters that should be allocated.
61
        @param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree.
62
        @param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree.
63
        @param[in] initial_diameter (double): Initial diameter that used for CF-Tree construction, it can be increase if entry_size_limit is exceeded.
64
        @param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics.
65
        @param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded during creation then diameter is increased and CF-Tree is rebuilt.
66
        @param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when entry_size_limit is exceeded.
67
        @param[in] outlier_detector (uint): Minimum number of data points that should be contained by node to be considered as non-outlier node.
68
        @param[in] ccore (bool): If True than DLL CCORE (C++ solution) will be used for solving the problem.
69
        
70
        @remark Despite eight arguments only the first two is mandatory, others can be ommitted. In this case default values are used for instance creation.
71
        
72
        Example:
73
        @code
74
            birch_instance1 = birch(sample1, 2);    # two clusters should be allocated
75
            birch_instance2 = birch(sample2, 5);    # five clusters should be allocated
76
            
77
            # three clusters should be allocated, but also each leaf node can have maximum 5 
78
            # entries and each entry can have maximum 5 descriptors with initial diameter 0.05.
79
            birch_instance3 = birch(sample3, 3, 5, 5, 0.05);
80
        @endcode
81
        
82
        """
83
        
84
        self.__pointer_data = data;
85
        self.__number_clusters = number_clusters;
86
        
87
        self.__measurement_type = type_measurement;
88
        self.__entry_size_limit = entry_size_limit;
89
        self.__diameter_multiplier = diameter_multiplier;
90
        self.__outlier_detector = outlier_detector;
91
        self.__ccore = ccore;
92
        
93
        self.__features = None;
94
        self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement);
95
        
96
        self.__clusters = [];
97
        self.__noise = [];
98
99
100
    def process(self):
101
        """!
102
        @brief Performs cluster analysis in line with rules of BIRCH algorithm.
103
        
104
        @remark Results of clustering can be obtained using corresponding gets methods.
105
        
106
        @see get_clusters()
107
        @see get_noise()
108
        
109
        """
110
        
111
        self.__insert_data();
112
        self.__extract_features();
113
114
        # in line with specification modify hierarchical algorithm should be used for further clustering
115
        current_number_clusters = len(self.__features);
116
        
117
        while (current_number_clusters > self.__number_clusters):
118
            indexes = self.__find_nearest_cluster_features();
119
            
120
            self.__features[indexes[0]] += self.__features[indexes[1]];
121
            self.__features.pop(indexes[1]);
122
            
123
            current_number_clusters = len(self.__features);
124
            
125
        # decode data
126
        self.__decode_data();
127
    
128
    
129
    def get_clusters(self):
130
        """!
131
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
132
        
133
        @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
134
        
135
        @return (list) List of allocated clusters.
136
        
137
        @see process()
138
        @see get_noise()
139
        
140
        """
141
        
142
        return self.__clusters;
143
    
144
    
145
    def get_noise(self):
146
        """!
147
        @brief Returns allocated noise.
148
        
149
        @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
150
        
151
        @return (list) List of indexes that are marked as a noise.
152
        
153
        @see process()
154
        @see get_clusters()
155
        
156
        """
157
        
158
        return self.__noise;
159
    
160
    
161
    def __extract_features(self):
162
        """!
163
        @brief Extracts features and outlier features from CF-tree cluster.
164
        
165
        """
166
        
167
        self.__features = [];
168
        self.__outlier_features = [];
169
        
170
        if (len(self.__tree.leafes) == 1):
171
            # parameters are too general, copy all entries
172
            for entry in self.__tree.leafes[0].entries:
173
                if (self.__outlier_detector < entry.number_points):
174
                    self.__features.append(entry);
175
                else:
176
                    self.__outlier_features.append(entry);
177
        else:
178
            # copy all leaf clustering features
179
            for node in self.__tree.leafes:
180
                if (self.__outlier_detector < node.feature.number_points):
181
                    self.__features.append(node.feature);
182
                else:
183
                    self.__outlier_features.append(node.feature);
184
    
185
    
186
    def __decode_data(self):
187
        """!
188
        @brief Decodes data from CF-tree features.
189
        
190
        """
191
        
192
        self.__clusters = [ [] for _ in range(self.__number_clusters) ];
193
        self.__noise = [];
194
        
195
        for index_point in range(0, len(self.__pointer_data)):
196
            (cluster_distance, cluster_index) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__features);
197
            (outlier_distance, _) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__outlier_features);
198
            
199 View Code Duplication
            if (cluster_distance < outlier_distance):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
200
                self.__clusters[cluster_index].append(index_point);
201
            else:
202
                self.__noise.append(index_point);
203
    
204
    
205
    def __insert_data(self):
206
        """!
207
        @brief Inserts input data to the tree.
208
        
209
        @remark If number of maximum number of entries is exceeded than diameter is increased and tree is rebuilt.
210
        
211
        """
212
        
213
        for index_point in range(0, len(self.__pointer_data)):
214
            point = self.__pointer_data[index_point];
215
            self.__tree.insert_cluster( [ point ] );
216
            
217
            if (self.__tree.amount_entries > self.__entry_size_limit):
218
                self.__tree = self.__rebuild_tree(index_point);
219
        
220
        #self.__tree.show_feature_destibution(self.__pointer_data);
221
    
222
    
223
    def __rebuild_tree(self, index_point):
224
        """!
225
        @brief Rebuilt tree in case of maxumum number of entries is exceeded.
226
        
227
        @param[in] index_point (uint): Index of point that is used as end point of re-building.
228
        
229
        @return (cftree) Rebuilt tree with encoded points till specified point from input data space.
230
        
231
        """
232
        
233
        rebuild_result = False;
234
        increased_diameter = self.__tree.threshold * self.__diameter_multiplier;
235
        
236
        tree = None;
237
        
238
        while(rebuild_result is False):
239
            # increase diameter and rebuild tree
240
            if (increased_diameter == 0.0):
241
                increased_diameter = 1.0;
242
            
243
            # build tree with update parameters
244
            tree = cftree(self.__tree.branch_factor, self.__tree.max_entries, increased_diameter, self.__tree.type_measurement);
245
            
246
            for index_point in range(0, index_point + 1):
247
                point = self.__pointer_data[index_point];
248
                tree.insert_cluster([point]);
249
            
250
                if (tree.amount_entries > self.__entry_size_limit):
251
                    increased_diameter *= self.__diameter_multiplier;
252
                    continue;
253
            
254
            # Re-build is successful.
255
            rebuild_result = True;
256
        
257
        return tree;
258
    
259
    
260
    def __find_nearest_cluster_features(self):
261
        """!
262
        @brief Find pair of nearest CF entries.
263
        
264
        @return (list) List of two nearest enties that are represented by list [index_point1, index_point2].
265
        
266
        """
267
        
268
        minimum_distance = float("Inf");
269
        index1 = 0;
270
        index2 = 0;
271
        
272
        for index_candidate1 in range(0, len(self.__features)):
273
            feature1 = self.__features[index_candidate1];
274
            for index_candidate2 in range(index_candidate1 + 1, len(self.__features)):
275
                feature2 = self.__features[index_candidate2];
276
                
277
                distance = feature1.get_distance(feature2, self.__measurement_type);
278
                if (distance < minimum_distance):
279
                    minimum_distance = distance;
280
                    
281
                    index1 = index_candidate1;
282
                    index2 = index_candidate2;
283
        
284
        return [index1, index2];
285
    
286
    
287
    def __get_nearest_feature(self, point, feature_collection):
288
        """!
289
        @brief Find nearest entry for specified point.
290
        
291
        @param[in] point (list): Pointer to point from input dataset.
292
        @param[in] feature_collection (list): Feature collection that is used for obtaining nearest feature for the specified point.
293
        
294
        @return (double, uint) Tuple of distance to nearest entry to the specified point and index of that entry.
295
        
296
        """
297
        
298
        minimum_distance = float("Inf");
299
        index_nearest_feature = -1;
300
        
301
        for index_entry in range(0, len(feature_collection)):
302
            point_entry = cfentry(1, linear_sum([ point ]), square_sum([ point ]));
303
            
304
            distance = feature_collection[index_entry].get_distance(point_entry, self.__measurement_type);
305
            if (distance < minimum_distance):
306
                minimum_distance = distance;
307
                index_nearest_feature = index_entry;
308
                
309
        return (minimum_distance, index_nearest_feature);
310