Completed
Push — master ( ddfd8a...6ab844 )
by Andrei
04:13
created

birch.get_noise()   A

Complexity

Conditions 1

Size

Total Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
dl 0
loc 14
rs 9.4285
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: BIRCH
4
@details Implementation based on article:
5
         - T.Zhang, R.Ramakrishnan, M.Livny. BIRCH: An Efficient Data Clustering Method for Very Large Databases. 1996.
6
         
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2017
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
from pyclustering.utils import linear_sum, square_sum;
30
31
from pyclustering.cluster.encoder import type_encoding;
32
33
from pyclustering.container.cftree import cftree, cfentry, measurement_type;
34
35
36
class birch:
37
    """!
38
    @brief Class represents clustering algorithm BIRCH.
39
    
40
    Example:
41
    @code
42
        # sample for cluster analysis (represented by list)
43
        sample = read_sample(path_to_sample);
44
        
45
        # create object of birch that uses CCORE for processing
46
        birch_instance = birch(sample, 2, 5, 5, 0.05, measurement_type.CENTROID_EUCLIDIAN_DISTANCE, 200, True);
47
        
48
        # cluster analysis
49
        birch_instance.process();
50
        
51
        # obtain results of clustering
52
        clusters = birch_instance.get_clusters();
53
    @endcode
54
    
55
    """
56
    
57
    def __init__(self, data, number_clusters, branching_factor = 5, max_node_entries = 5, initial_diameter = 0.1, type_measurement = measurement_type.CENTROID_EUCLIDIAN_DISTANCE, entry_size_limit = 200, diameter_multiplier = 1.5, ccore = False):
58
        """!
59
        @brief Constructor of clustering algorithm BIRCH.
60
        
61
        @param[in] data (list): Input data presented as list of points (objects), where each point should be represented by list or tuple.
62
        @param[in] number_clusters (uint): Number of clusters that should be allocated.
63
        @param[in] branching_factor (uint): Maximum number of successor that might be contained by each non-leaf node in CF-Tree.
64
        @param[in] max_node_entries (uint): Maximum number of entries that might be contained by each leaf node in CF-Tree.
65
        @param[in] initial_diameter (double): Initial diameter that used for CF-Tree construction, it can be increase if entry_size_limit is exceeded.
66
        @param[in] type_measurement (measurement_type): Type measurement used for calculation distance metrics.
67
        @param[in] entry_size_limit (uint): Maximum number of entries that can be stored in CF-Tree, if it is exceeded during creation then diameter is increased and CF-Tree is rebuilt.
68
        @param[in] diameter_multiplier (double): Multiplier that is used for increasing diameter when entry_size_limit is exceeded.
69
        @param[in] ccore (bool): If True than DLL CCORE (C++ solution) will be used for solving the problem.
70
        
71
        @remark Despite eight arguments only the first two is mandatory, others can be ommitted. In this case default values are used for instance creation.
72
        
73
        Example:
74
        @code
75
            birch_instance1 = birch(sample1, 2);    # two clusters should be allocated
76
            birch_instance2 = birch(sample2, 5);    # five clusters should be allocated
77
            
78
            # three clusters should be allocated, but also each leaf node can have maximum 5 
79
            # entries and each entry can have maximum 5 descriptors with initial diameter 0.05.
80
            birch_instance3 = birch(sample3, 3, 5, 5, 0.05);
81
        @endcode
82
        
83
        """
84
        
85
        self.__pointer_data = data;
86
        self.__number_clusters = number_clusters;
87
        
88
        self.__measurement_type = type_measurement;
89
        self.__entry_size_limit = entry_size_limit;
90
        self.__diameter_multiplier = diameter_multiplier;
91
        self.__ccore = ccore;
92
        
93
        self.__features = None;
94
        self.__tree = cftree(branching_factor, max_node_entries, initial_diameter, type_measurement);
95
        
96
        self.__clusters = [];
97
        self.__noise = [];
98
99
100
    def process(self):
101
        """!
102
        @brief Performs cluster analysis in line with rules of BIRCH algorithm.
103
        
104
        @remark Results of clustering can be obtained using corresponding gets methods.
105
        
106
        @see get_clusters()
107
        
108
        """
109
        
110
        self.__insert_data();
111
        self.__extract_features();
112
113
        # in line with specification modify hierarchical algorithm should be used for further clustering
114
        current_number_clusters = len(self.__features);
115
        
116
        while (current_number_clusters > self.__number_clusters):
117
            indexes = self.__find_nearest_cluster_features();
118
            
119
            self.__features[indexes[0]] += self.__features[indexes[1]];
120
            self.__features.pop(indexes[1]);
121
            
122
            current_number_clusters = len(self.__features);
123
            
124
        # decode data
125
        self.__decode_data();
126
    
127
    
128
    def get_clusters(self):
129
        """!
130
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
131
        
132
        @remark Allocated noise can be returned only after data processing (use method process() before). Otherwise empty list is returned.
133
        
134
        @return (list) List of allocated clusters.
135
        
136
        @see process()
137
        @see get_noise()
138
        
139
        """
140
        
141
        return self.__clusters;
142
143
144
    def get_cluster_encoding(self):
145
        """!
146
        @brief Returns clustering result representation type that indicate how clusters are encoded.
147
        
148
        @return (type_encoding) Clustering result representation.
149
        
150
        @see get_clusters()
151
        
152
        """
153
        
154
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
155
156
157
    def __extract_features(self):
158
        """!
159
        @brief Extracts features from CF-tree cluster.
160
        
161
        """
162
        
163
        self.__features = [];
164
        
165
        if (len(self.__tree.leafes) == 1):
166
            # parameters are too general, copy all entries
167
            for entry in self.__tree.leafes[0].entries:
168
                self.__features.append(entry);
169
170
        else:
171
            # copy all leaf clustering features
172
            for node in self.__tree.leafes:
173
                self.__features.append(node.feature);
174
    
175
    
176
    def __decode_data(self):
177
        """!
178
        @brief Decodes data from CF-tree features.
179
        
180
        """
181
        
182
        self.__clusters = [ [] for _ in range(self.__number_clusters) ];
183
        self.__noise = [];
184
        
185
        for index_point in range(0, len(self.__pointer_data)):
186
            (_, cluster_index) = self.__get_nearest_feature(self.__pointer_data[index_point], self.__features);
187
            
188
            self.__clusters[cluster_index].append(index_point);
189
    
190
    
191
    def __insert_data(self):
192
        """!
193
        @brief Inserts input data to the tree.
194
        
195
        @remark If number of maximum number of entries is exceeded than diameter is increased and tree is rebuilt.
196
        
197
        """
198
        
199 View Code Duplication
        for index_point in range(0, len(self.__pointer_data)):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
200
            point = self.__pointer_data[index_point];
201
            self.__tree.insert_cluster( [ point ] );
202
            
203
            if (self.__tree.amount_entries > self.__entry_size_limit):
204
                self.__tree = self.__rebuild_tree(index_point);
205
        
206
        #self.__tree.show_feature_destibution(self.__pointer_data);
207
    
208
    
209
    def __rebuild_tree(self, index_point):
210
        """!
211
        @brief Rebuilt tree in case of maxumum number of entries is exceeded.
212
        
213
        @param[in] index_point (uint): Index of point that is used as end point of re-building.
214
        
215
        @return (cftree) Rebuilt tree with encoded points till specified point from input data space.
216
        
217
        """
218
        
219
        rebuild_result = False;
220
        increased_diameter = self.__tree.threshold * self.__diameter_multiplier;
221
        
222
        tree = None;
223
        
224
        while(rebuild_result is False):
225
            # increase diameter and rebuild tree
226
            if (increased_diameter == 0.0):
227
                increased_diameter = 1.0;
228
            
229
            # build tree with update parameters
230
            tree = cftree(self.__tree.branch_factor, self.__tree.max_entries, increased_diameter, self.__tree.type_measurement);
231
            
232
            for index_point in range(0, index_point + 1):
233
                point = self.__pointer_data[index_point];
234
                tree.insert_cluster([point]);
235
            
236
                if (tree.amount_entries > self.__entry_size_limit):
237
                    increased_diameter *= self.__diameter_multiplier;
238
                    continue;
239
            
240
            # Re-build is successful.
241
            rebuild_result = True;
242
        
243
        return tree;
244
    
245
    
246
    def __find_nearest_cluster_features(self):
247
        """!
248
        @brief Find pair of nearest CF entries.
249
        
250
        @return (list) List of two nearest enties that are represented by list [index_point1, index_point2].
251
        
252
        """
253
        
254
        minimum_distance = float("Inf");
255
        index1 = 0;
256
        index2 = 0;
257
        
258
        for index_candidate1 in range(0, len(self.__features)):
259
            feature1 = self.__features[index_candidate1];
260
            for index_candidate2 in range(index_candidate1 + 1, len(self.__features)):
261
                feature2 = self.__features[index_candidate2];
262
                
263
                distance = feature1.get_distance(feature2, self.__measurement_type);
264
                if (distance < minimum_distance):
265
                    minimum_distance = distance;
266
                    
267
                    index1 = index_candidate1;
268
                    index2 = index_candidate2;
269
        
270
        return [index1, index2];
271
    
272
    
273
    def __get_nearest_feature(self, point, feature_collection):
274
        """!
275
        @brief Find nearest entry for specified point.
276
        
277
        @param[in] point (list): Pointer to point from input dataset.
278
        @param[in] feature_collection (list): Feature collection that is used for obtaining nearest feature for the specified point.
279
        
280
        @return (double, uint) Tuple of distance to nearest entry to the specified point and index of that entry.
281
        
282
        """
283
        
284
        minimum_distance = float("Inf");
285
        index_nearest_feature = -1;
286
        
287
        for index_entry in range(0, len(feature_collection)):
288
            point_entry = cfentry(1, linear_sum([ point ]), square_sum([ point ]));
289
            
290
            distance = feature_collection[index_entry].get_distance(point_entry, self.__measurement_type);
291
            if (distance < minimum_distance):
292
                minimum_distance = distance;
293
                index_nearest_feature = index_entry;
294
                
295
        return (minimum_distance, index_nearest_feature);
296