1
|
|
|
"""!
|
2
|
|
|
|
3
|
|
|
@brief Data Structure: CF-Tree
|
4
|
|
|
@details Based on book description:
|
5
|
|
|
- M.Zhang, R.Ramakrishnan, M.Livny. BIRCH: An Efficient Data Clustering Method for Very Large Databases. 1996.
|
6
|
|
|
|
7
|
|
|
@authors Andrei Novikov ([email protected])
|
8
|
|
|
@date 2014-2017
|
9
|
|
|
@copyright GNU Public License
|
10
|
|
|
|
11
|
|
|
@cond GNU_PUBLIC_LICENSE
|
12
|
|
|
PyClustering is free software: you can redistribute it and/or modify
|
13
|
|
|
it under the terms of the GNU General Public License as published by
|
14
|
|
|
the Free Software Foundation, either version 3 of the License, or
|
15
|
|
|
(at your option) any later version.
|
16
|
|
|
|
17
|
|
|
PyClustering is distributed in the hope that it will be useful,
|
18
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
19
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
20
|
|
|
GNU General Public License for more details.
|
21
|
|
|
|
22
|
|
|
You should have received a copy of the GNU General Public License
|
23
|
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
24
|
|
|
@endcond
|
25
|
|
|
|
26
|
|
|
"""
|
27
|
|
|
|
28
|
|
|
from copy import copy;
|
29
|
|
|
|
30
|
|
|
from pyclustering.cluster import cluster_visualizer;
|
31
|
|
|
|
32
|
|
|
from pyclustering.utils import euclidean_distance_sqrt;
|
33
|
|
|
from pyclustering.utils import manhattan_distance;
|
34
|
|
|
from pyclustering.utils import list_math_addition, list_math_subtraction, list_math_multiplication;
|
35
|
|
|
from pyclustering.utils import linear_sum, square_sum;
|
36
|
|
|
|
37
|
|
|
from enum import IntEnum;
|
38
|
|
|
|
39
|
|
|
|
40
|
|
|
class measurement_type(IntEnum):
|
41
|
|
|
"""!
|
42
|
|
|
@brief Enumeration of measurement types for CF-Tree.
|
43
|
|
|
|
44
|
|
|
@see cftree
|
45
|
|
|
|
46
|
|
|
"""
|
47
|
|
|
|
48
|
|
|
## Euclidian distance between centroids of clustering features.
|
49
|
|
|
CENTROID_EUCLIDIAN_DISTANCE = 0;
|
50
|
|
|
|
51
|
|
|
## Manhattan distance between centroids of clustering features.
|
52
|
|
|
CENTROID_MANHATTAN_DISTANCE = 1;
|
53
|
|
|
|
54
|
|
|
## Average distance between all objects from clustering features.
|
55
|
|
|
AVERAGE_INTER_CLUSTER_DISTANCE = 2;
|
56
|
|
|
|
57
|
|
|
## Average distance between all objects within clustering features and between them.
|
58
|
|
|
AVERAGE_INTRA_CLUSTER_DISTANCE = 3;
|
59
|
|
|
|
60
|
|
|
## Variance based distance between clustering features.
|
61
|
|
|
VARIANCE_INCREASE_DISTANCE = 4;
|
62
|
|
|
|
63
|
|
|
|
64
|
|
|
class cfnode_type(IntEnum):
|
65
|
|
|
"""!
|
66
|
|
|
@brief Enumeration of CF-Node types that are used by CF-Tree.
|
67
|
|
|
|
68
|
|
|
@see cfnode
|
69
|
|
|
@see cftree
|
70
|
|
|
|
71
|
|
|
"""
|
72
|
|
|
|
73
|
|
|
## Undefined node.
|
74
|
|
|
CFNODE_DUMMY = 0;
|
75
|
|
|
|
76
|
|
|
## Leaf node hasn't got successors, only entries.
|
77
|
|
|
CFNODE_LEAF = 1;
|
78
|
|
|
|
79
|
|
|
## Non-leaf node has got successors and hasn't got entries.
|
80
|
|
|
CFNODE_NONLEAF = 2;
|
81
|
|
|
|
82
|
|
|
|
83
|
|
|
class cfentry:
|
84
|
|
|
"""!
|
85
|
|
|
@brief Clustering feature representation.
|
86
|
|
|
|
87
|
|
|
@see cfnode
|
88
|
|
|
@see cftree
|
89
|
|
|
|
90
|
|
|
"""
|
91
|
|
|
|
92
|
|
|
@property
|
93
|
|
|
def number_points(self):
|
94
|
|
|
"""!
|
95
|
|
|
@brief Returns number of points that are encoded.
|
96
|
|
|
|
97
|
|
|
@return (uint) Number of encoded points.
|
98
|
|
|
|
99
|
|
|
"""
|
100
|
|
|
return self.__number_points;
|
101
|
|
|
|
102
|
|
|
@property
|
103
|
|
|
def linear_sum(self):
|
104
|
|
|
"""!
|
105
|
|
|
@brief Returns linear sum.
|
106
|
|
|
|
107
|
|
|
@return (list) Linear sum.
|
108
|
|
|
|
109
|
|
|
"""
|
110
|
|
|
|
111
|
|
|
return self.__linear_sum;
|
112
|
|
|
|
113
|
|
|
@property
|
114
|
|
|
def square_sum(self):
|
115
|
|
|
"""!
|
116
|
|
|
@brief Returns square sum.
|
117
|
|
|
|
118
|
|
|
@return (double) Square sum.
|
119
|
|
|
|
120
|
|
|
"""
|
121
|
|
|
return self.__square_sum;
|
122
|
|
|
|
123
|
|
|
|
124
|
|
|
def __init__(self, number_points, linear_sum, square_sum):
|
|
|
|
|
125
|
|
|
"""!
|
126
|
|
|
@brief CF-entry constructor.
|
127
|
|
|
|
128
|
|
|
@param[in] number_points (uint): Number of objects that is represented by the entry.
|
129
|
|
|
@param[in] linear_sum (list): Linear sum of values that represent objects in each dimension.
|
130
|
|
|
@param[in] square_sum (double): Square sum of values that represent objects.
|
131
|
|
|
|
132
|
|
|
"""
|
133
|
|
|
|
134
|
|
|
self.__number_points = number_points;
|
135
|
|
|
self.__linear_sum = linear_sum;
|
136
|
|
|
self.__square_sum = square_sum;
|
137
|
|
|
|
138
|
|
|
self.__centroid = None;
|
139
|
|
|
self.__radius = None;
|
140
|
|
|
self.__diameter = None;
|
141
|
|
|
|
142
|
|
|
|
143
|
|
|
def __copy__(self):
|
144
|
|
|
"""!
|
145
|
|
|
@returns (cfentry) Makes copy of the cfentry instance.
|
146
|
|
|
|
147
|
|
|
"""
|
148
|
|
|
return cfentry(self.__number_points, self.__linear_sum, self.__square_sum);
|
149
|
|
|
|
150
|
|
|
|
151
|
|
|
def __repr__(self):
|
152
|
|
|
"""!
|
153
|
|
|
@return (string) Default cfentry representation.
|
154
|
|
|
|
155
|
|
|
"""
|
156
|
|
|
return 'CF (%s, %s, %0.2f) [%s]' % ( self.number_points, self.linear_sum, self.__square_sum, hex(id(self)) );
|
157
|
|
|
|
158
|
|
|
|
159
|
|
|
def __str__(self):
|
160
|
|
|
"""!
|
161
|
|
|
@brief Default cfentry string representation.
|
162
|
|
|
|
163
|
|
|
"""
|
164
|
|
|
return self.__repr__();
|
165
|
|
|
|
166
|
|
|
|
167
|
|
|
def __add__(self, entry):
|
168
|
|
|
"""!
|
169
|
|
|
@brief Overloaded operator add. Performs addition of two clustering features.
|
170
|
|
|
|
171
|
|
|
@param[in] entry (cfentry): Entry that is added to the current.
|
172
|
|
|
|
173
|
|
|
@return (cfentry) Result of addition of two clustering features.
|
174
|
|
|
|
175
|
|
|
"""
|
176
|
|
|
|
177
|
|
|
number_points = self.number_points + entry.number_points;
|
178
|
|
|
result_linear_sum = list_math_addition(self.linear_sum, entry.linear_sum);
|
179
|
|
|
result_square_sum = self.square_sum + entry.square_sum;
|
180
|
|
|
|
181
|
|
|
return cfentry(number_points, result_linear_sum, result_square_sum);
|
182
|
|
|
|
183
|
|
|
|
184
|
|
|
def __sub__(self, entry):
|
185
|
|
|
"""!
|
186
|
|
|
@brief Overloaded operator sub. Performs substraction of two clustering features.
|
187
|
|
|
@details Substraction can't be performed with clustering feature whose description is less then substractor.
|
188
|
|
|
|
189
|
|
|
@param[in] entry (cfentry): Entry that is substracted from the current.
|
190
|
|
|
|
191
|
|
|
@return (cfentry) Result of substraction of two clustering features.
|
192
|
|
|
|
193
|
|
|
"""
|
194
|
|
|
|
195
|
|
|
number_points = self.number_points - entry.number_points;
|
196
|
|
|
result_linear_sum = list_math_subtraction(self.linear_sum, entry.linear_sum);
|
197
|
|
|
result_square_sum = self.square_sum - entry.square_sum;
|
198
|
|
|
|
199
|
|
|
if ( (number_points < 0) or (result_square_sum < 0) ):
|
200
|
|
|
raise NameError("Substruction with negative result is not possible for clustering features.");
|
201
|
|
|
|
202
|
|
|
return cfentry(number_points, result_linear_sum, result_square_sum);
|
203
|
|
|
|
204
|
|
|
|
205
|
|
|
def __eq__(self, entry):
|
206
|
|
|
"""!
|
207
|
|
|
@brief Overloaded operator eq.
|
208
|
|
|
@details Performs comparison of two clustering features.
|
209
|
|
|
|
210
|
|
|
@param[in] entry (cfentry): Entry that is used for comparison with current.
|
211
|
|
|
|
212
|
|
|
@return (bool) True is both clustering features are equals in line with tolerance, otherwise False.
|
213
|
|
|
|
214
|
|
|
"""
|
215
|
|
|
|
216
|
|
|
tolerance = 0.00001;
|
217
|
|
|
|
218
|
|
|
result = (self.__number_points == entry.number_points);
|
219
|
|
|
result &= ( (self.square_sum + tolerance > entry.square_sum) and (self.square_sum - tolerance < entry.square_sum) );
|
220
|
|
|
|
221
|
|
|
for index_dimension in range(0, len(self.linear_sum)):
|
222
|
|
|
result &= ( (self.linear_sum[index_dimension] + tolerance > entry.linear_sum[index_dimension]) and (self.linear_sum[index_dimension] - tolerance < entry.linear_sum[index_dimension]) );
|
223
|
|
|
|
224
|
|
|
return result;
|
225
|
|
|
|
226
|
|
|
|
227
|
|
|
def get_distance(self, entry, type_measurement):
|
228
|
|
|
"""!
|
229
|
|
|
@brief Calculates distance between two clusters in line with measurement type.
|
230
|
|
|
|
231
|
|
|
@details In case of usage CENTROID_EUCLIDIAN_DISTANCE square euclidian distance will be returned.
|
232
|
|
|
Square root should be taken from the result for obtaining real euclidian distance between
|
233
|
|
|
entries.
|
234
|
|
|
|
235
|
|
|
@param[in] entry (cfentry): Clustering feature to which distance should be obtained.
|
236
|
|
|
@param[in] type_measurement (measurement_type): Distance measurement algorithm between two clusters.
|
237
|
|
|
|
238
|
|
|
@return (double) Distance between two clusters.
|
239
|
|
|
|
240
|
|
|
"""
|
241
|
|
|
|
242
|
|
|
if (type_measurement is measurement_type.CENTROID_EUCLIDIAN_DISTANCE):
|
243
|
|
|
return euclidean_distance_sqrt(entry.get_centroid(), self.get_centroid());
|
244
|
|
|
|
245
|
|
|
elif (type_measurement is measurement_type.CENTROID_MANHATTAN_DISTANCE):
|
246
|
|
|
return manhattan_distance(entry.get_centroid(), self.get_centroid());
|
247
|
|
|
|
248
|
|
|
elif (type_measurement is measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE):
|
249
|
|
|
return self.__get_average_inter_cluster_distance(entry);
|
250
|
|
|
|
251
|
|
|
elif (type_measurement is measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE):
|
252
|
|
|
return self.__get_average_intra_cluster_distance(entry);
|
253
|
|
|
|
254
|
|
|
elif (type_measurement is measurement_type.VARIANCE_INCREASE_DISTANCE):
|
255
|
|
|
return self.__get_variance_increase_distance(entry);
|
256
|
|
|
|
257
|
|
|
else:
|
258
|
|
|
assert 0;
|
259
|
|
|
|
260
|
|
|
|
261
|
|
|
def get_centroid(self):
|
262
|
|
|
"""!
|
263
|
|
|
@brief Calculates centroid of cluster that is represented by the entry.
|
264
|
|
|
@details It's calculated once when it's requested after the last changes.
|
265
|
|
|
|
266
|
|
|
@return (list) Centroid of cluster that is represented by the entry.
|
267
|
|
|
|
268
|
|
|
"""
|
269
|
|
|
|
270
|
|
|
if (self.__centroid is not None):
|
271
|
|
|
return self.__centroid;
|
272
|
|
|
|
273
|
|
|
self.__centroid = [0] * len(self.linear_sum);
|
274
|
|
|
for index_dimension in range(0, len(self.linear_sum)):
|
275
|
|
|
self.__centroid[index_dimension] = self.linear_sum[index_dimension] / self.number_points;
|
276
|
|
|
|
277
|
|
|
return self.__centroid;
|
278
|
|
|
|
279
|
|
|
|
280
|
|
|
def get_radius(self):
|
281
|
|
|
"""!
|
282
|
|
|
@brief Calculates radius of cluster that is represented by the entry.
|
283
|
|
|
@details It's calculated once when it's requested after the last changes.
|
284
|
|
|
|
285
|
|
|
@return (double) Radius of cluster that is represented by the entry.
|
286
|
|
|
|
287
|
|
|
"""
|
288
|
|
|
|
289
|
|
|
if (self.__radius is not None):
|
290
|
|
|
return self.__radius;
|
291
|
|
|
|
292
|
|
|
centroid = self.get_centroid();
|
293
|
|
|
|
294
|
|
|
radius_part_1 = self.square_sum;
|
295
|
|
|
|
296
|
|
|
radius_part_2 = 0.0;
|
297
|
|
|
radius_part_3 = 0.0;
|
298
|
|
|
|
299
|
|
|
if (type(centroid) == list):
|
300
|
|
|
radius_part_2 = 2.0 * sum(list_math_multiplication(self.linear_sum, centroid));
|
301
|
|
|
radius_part_3 = self.number_points * sum(list_math_multiplication(centroid, centroid));
|
302
|
|
|
else:
|
303
|
|
|
radius_part_2 = 2.0 * self.linear_sum * centroid;
|
304
|
|
|
radius_part_3 = self.number_points * centroid * centroid;
|
305
|
|
|
|
306
|
|
|
self.__radius = ( (1.0 / self.number_points) * (radius_part_1 - radius_part_2 + radius_part_3) ) ** 0.5;
|
307
|
|
|
return self.__radius;
|
308
|
|
|
|
309
|
|
|
|
310
|
|
|
def get_diameter(self):
|
311
|
|
|
"""!
|
312
|
|
|
@brief Calculates diameter of cluster that is represented by the entry.
|
313
|
|
|
@details It's calculated once when it's requested after the last changes.
|
314
|
|
|
|
315
|
|
|
@return (double) Diameter of cluster that is represented by the entry.
|
316
|
|
|
|
317
|
|
|
"""
|
318
|
|
|
|
319
|
|
|
if (self.__diameter is not None):
|
320
|
|
|
return self.__diameter;
|
321
|
|
|
|
322
|
|
|
diameter_part = 0.0;
|
323
|
|
|
if (type(self.linear_sum) == list):
|
324
|
|
|
diameter_part = self.square_sum * self.number_points - 2.0 * sum(list_math_multiplication(self.linear_sum, self.linear_sum)) + self.square_sum * self.number_points;
|
325
|
|
|
else:
|
326
|
|
|
diameter_part = self.square_sum * self.number_points - 2.0 * self.linear_sum * self.linear_sum + self.square_sum * self.number_points;
|
327
|
|
|
|
328
|
|
|
self.__diameter = ( diameter_part / (self.number_points * (self.number_points - 1)) ) ** 0.5;
|
329
|
|
|
return self.__diameter;
|
330
|
|
|
|
331
|
|
|
|
332
|
|
|
def __get_average_inter_cluster_distance(self, entry):
|
333
|
|
|
"""!
|
334
|
|
|
@brief Calculates average inter cluster distance between current and specified clusters.
|
335
|
|
|
|
336
|
|
|
@param[in] entry (cfentry): Clustering feature to which distance should be obtained.
|
337
|
|
|
|
338
|
|
|
@return (double) Average inter cluster distance.
|
339
|
|
|
|
340
|
|
|
"""
|
341
|
|
|
|
342
|
|
|
linear_part_distance = sum(list_math_multiplication(self.linear_sum, entry.linear_sum));
|
343
|
|
|
|
344
|
|
|
return ( (entry.number_points * self.square_sum - 2.0 * linear_part_distance + self.number_points * entry.square_sum) / (self.number_points * entry.number_points) ) ** 0.5;
|
345
|
|
|
|
346
|
|
|
|
347
|
|
|
def __get_average_intra_cluster_distance(self, entry):
|
348
|
|
|
"""!
|
349
|
|
|
@brief Calculates average intra cluster distance between current and specified clusters.
|
350
|
|
|
|
351
|
|
|
@param[in] entry (cfentry): Clustering feature to which distance should be obtained.
|
352
|
|
|
|
353
|
|
|
@return (double) Average intra cluster distance.
|
354
|
|
|
|
355
|
|
|
"""
|
356
|
|
|
|
357
|
|
|
linear_part_first = list_math_addition(self.linear_sum, entry.linear_sum);
|
358
|
|
|
linear_part_second = linear_part_first;
|
359
|
|
|
|
360
|
|
|
linear_part_distance = sum(list_math_multiplication(linear_part_first, linear_part_second));
|
361
|
|
|
|
362
|
|
|
general_part_distance = 2.0 * (self.number_points + entry.number_points) * (self.square_sum + entry.square_sum) - 2.0 * linear_part_distance;
|
363
|
|
|
|
364
|
|
|
return (general_part_distance / ( (self.number_points + entry.number_points) * (self.number_points + entry.number_points - 1.0) )) ** 0.5;
|
365
|
|
|
|
366
|
|
|
|
367
|
|
|
def __get_variance_increase_distance(self, entry):
|
368
|
|
|
"""!
|
369
|
|
|
@brief Calculates variance increase distance between current and specified clusters.
|
370
|
|
|
|
371
|
|
|
@param[in] entry (cfentry): Clustering feature to which distance should be obtained.
|
372
|
|
|
|
373
|
|
|
@return (double) Variance increase distance.
|
374
|
|
|
|
375
|
|
|
"""
|
376
|
|
|
|
377
|
|
|
linear_part_12 = list_math_addition(self.linear_sum, entry.linear_sum);
|
378
|
|
|
variance_part_first = (self.square_sum + entry.square_sum) - \
|
379
|
|
|
2.0 * sum(list_math_multiplication(linear_part_12, linear_part_12)) / (self.number_points + entry.number_points) + \
|
380
|
|
|
(self.number_points + entry.number_points) * sum(list_math_multiplication(linear_part_12, linear_part_12)) / (self.number_points + entry.number_points)**2.0;
|
381
|
|
|
|
382
|
|
|
|
383
|
|
|
linear_part_11 = sum(list_math_multiplication(self.linear_sum, self.linear_sum));
|
384
|
|
|
variance_part_second = -( self.square_sum - (2.0 * linear_part_11 / self.number_points) + (linear_part_11 / self.number_points) );
|
385
|
|
|
|
386
|
|
|
linear_part_22 = sum(list_math_multiplication(entry.linear_sum, entry.linear_sum));
|
387
|
|
|
variance_part_third = -( entry.square_sum - (2.0 / entry.number_points) * linear_part_22 + entry.number_points * (1.0 / entry.number_points ** 2.0) * linear_part_22 );
|
388
|
|
|
|
389
|
|
|
return (variance_part_first + variance_part_second + variance_part_third);
|
390
|
|
|
|
391
|
|
|
|
392
|
|
|
class cfnode:
|
393
|
|
|
"""!
|
394
|
|
|
@brief Representation of node of CF-Tree.
|
395
|
|
|
|
396
|
|
|
"""
|
397
|
|
|
|
398
|
|
|
def __init__(self, feature, parent, payload):
|
399
|
|
|
"""!
|
400
|
|
|
@brief Constructor of abstract CF node.
|
401
|
|
|
|
402
|
|
|
@param[in] feature (cfentry): Clustering feature of the created node.
|
403
|
|
|
@param[in] parent (cfnode): Parent of the created node.
|
404
|
|
|
@param[in] payload (*): Data that is stored by the node.
|
405
|
|
|
|
406
|
|
|
"""
|
407
|
|
|
|
408
|
|
|
## Clustering feature of the node.
|
409
|
|
|
self.feature = copy(feature);
|
410
|
|
|
|
411
|
|
|
## Pointer to the parent node (None for root).
|
412
|
|
|
self.parent = parent;
|
413
|
|
|
|
414
|
|
|
## Type node (leaf or non-leaf).
|
415
|
|
|
self.type = cfnode_type.CFNODE_DUMMY;
|
416
|
|
|
|
417
|
|
|
## Payload of node where user data can be stored.
|
418
|
|
|
self.payload = payload;
|
419
|
|
|
|
420
|
|
|
|
421
|
|
|
def __repr__(self):
|
422
|
|
|
"""!
|
423
|
|
|
@return (string) Default representation of CF node.
|
424
|
|
|
|
425
|
|
|
"""
|
426
|
|
|
|
427
|
|
|
return 'CF node %s, parent %s, feature %s' % ( hex(id(self)), self.parent, self.feature );
|
428
|
|
|
|
429
|
|
|
|
430
|
|
|
def __str__(self):
|
431
|
|
|
"""!
|
432
|
|
|
@return (string) String representation of CF node.
|
433
|
|
|
|
434
|
|
|
"""
|
435
|
|
|
return self.__repr__();
|
436
|
|
|
|
437
|
|
|
|
438
|
|
|
def get_distance(self, node, type_measurement):
|
439
|
|
|
"""!
|
440
|
|
|
@brief Calculates distance between nodes in line with specified type measurement.
|
441
|
|
|
|
442
|
|
|
@param[in] node (cfnode): CF-node that is used for calculation distance to the current node.
|
443
|
|
|
@param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance.
|
444
|
|
|
|
445
|
|
|
@return (double) Distance between two nodes.
|
446
|
|
|
|
447
|
|
|
"""
|
448
|
|
|
|
449
|
|
|
return self.feature.get_distance(node.feature, type_measurement);
|
450
|
|
|
|
451
|
|
|
|
452
|
|
|
class non_leaf_node(cfnode):
|
453
|
|
|
"""!
|
454
|
|
|
@brief Representation of clustering feature non-leaf node.
|
455
|
|
|
|
456
|
|
|
"""
|
457
|
|
|
|
458
|
|
|
@property
|
459
|
|
|
def successors(self):
|
460
|
|
|
"""!
|
461
|
|
|
@return (list) List of successors of the node.
|
462
|
|
|
|
463
|
|
|
"""
|
464
|
|
|
return self.__successors;
|
465
|
|
|
|
466
|
|
|
|
467
|
|
|
def __init__(self, feature, parent, successors, payload):
|
468
|
|
|
"""!
|
469
|
|
|
@brief Create CF Non-leaf node.
|
470
|
|
|
|
471
|
|
|
@param[in] feature (cfentry): Clustering feature of the created node.
|
472
|
|
|
@param[in] parent (non_leaf_node): Parent of the created node.
|
473
|
|
|
@param[in] successors (list): List of successors of the node.
|
474
|
|
|
@param[in] payload (*): Data that is stored by the node.
|
475
|
|
|
|
476
|
|
|
"""
|
477
|
|
|
|
478
|
|
|
super().__init__(feature, parent, payload);
|
479
|
|
|
|
480
|
|
|
## Node type in CF tree that is CFNODE_NONLEAF for non leaf node.
|
481
|
|
|
self.type = cfnode_type.CFNODE_NONLEAF;
|
482
|
|
|
|
483
|
|
|
self.__successors = successors;
|
484
|
|
|
|
485
|
|
|
|
486
|
|
|
def __repr__(self):
|
487
|
|
|
"""!
|
488
|
|
|
@return (string) Representation of non-leaf node representation.
|
489
|
|
|
|
490
|
|
|
"""
|
491
|
|
|
return 'Non-leaf node %s, parent %s, feature %s, successors: %d' % ( hex(id(self)), self.parent, self.feature, len(self.successors) );
|
492
|
|
|
|
493
|
|
|
|
494
|
|
|
def __str__(self):
|
495
|
|
|
"""!
|
496
|
|
|
@return (string) String non-leaf representation.
|
497
|
|
|
|
498
|
|
|
"""
|
499
|
|
|
return self.__repr__();
|
500
|
|
|
|
501
|
|
|
|
502
|
|
|
def insert_successor(self, successor):
|
503
|
|
|
"""!
|
504
|
|
|
@brief Insert successor to the node.
|
505
|
|
|
|
506
|
|
|
@param[in] successor (cfnode): Successor for adding.
|
507
|
|
|
|
508
|
|
|
"""
|
509
|
|
|
|
510
|
|
|
self.feature += successor.feature;
|
511
|
|
|
self.successors.append(successor);
|
512
|
|
|
|
513
|
|
|
successor.parent = self;
|
514
|
|
|
|
515
|
|
|
|
516
|
|
|
def remove_successor(self, successor):
|
517
|
|
|
"""!
|
518
|
|
|
@brief Remove successor from the node.
|
519
|
|
|
|
520
|
|
|
@param[in] successor (cfnode): Successor for removing.
|
521
|
|
|
|
522
|
|
|
"""
|
523
|
|
|
|
524
|
|
|
self.feature -= successor.feature;
|
525
|
|
|
self.successors.append(successor);
|
526
|
|
|
|
527
|
|
|
successor.parent = self;
|
528
|
|
|
|
529
|
|
|
|
530
|
|
|
def merge(self, node):
|
531
|
|
|
"""!
|
532
|
|
|
@brief Merge non-leaf node to the current.
|
533
|
|
|
|
534
|
|
|
@param[in] node (non_leaf_node): Non-leaf node that should be merged with current.
|
535
|
|
|
|
536
|
|
|
"""
|
537
|
|
|
|
538
|
|
|
self.feature += node.feature;
|
539
|
|
|
|
540
|
|
|
for child in node.successors:
|
541
|
|
|
child.parent = self;
|
542
|
|
|
self.successors.append(child);
|
543
|
|
|
|
544
|
|
|
|
545
|
|
|
def get_farthest_successors(self, type_measurement):
|
546
|
|
|
"""!
|
547
|
|
|
@brief Find pair of farthest successors of the node in line with measurement type.
|
548
|
|
|
|
549
|
|
|
@param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest successors.
|
550
|
|
|
|
551
|
|
|
@return (list) Pair of farthest successors represented by list [cfnode1, cfnode2].
|
552
|
|
|
|
553
|
|
|
"""
|
554
|
|
|
|
555
|
|
|
farthest_node1 = None;
|
556
|
|
|
farthest_node2 = None;
|
557
|
|
|
farthest_distance = 0;
|
558
|
|
|
|
559
|
|
|
for i in range(0, len(self.successors)):
|
560
|
|
|
candidate1 = self.successors[i];
|
561
|
|
|
|
562
|
|
|
for j in range(i + 1, len(self.successors)):
|
563
|
|
|
candidate2 = self.successors[j];
|
564
|
|
|
candidate_distance = candidate1.get_distance(candidate2, type_measurement);
|
565
|
|
|
|
566
|
|
|
if (candidate_distance > farthest_distance):
|
567
|
|
|
farthest_distance = candidate_distance;
|
568
|
|
|
farthest_node1 = candidate1;
|
569
|
|
|
farthest_node2 = candidate2;
|
570
|
|
View Code Duplication |
|
|
|
|
|
571
|
|
|
return [farthest_node1, farthest_node2];
|
572
|
|
|
|
573
|
|
|
|
574
|
|
|
def get_nearest_successors(self, type_measurement):
|
575
|
|
|
"""!
|
576
|
|
|
@brief Find pair of nearest successors of the node in line with measurement type.
|
577
|
|
|
|
578
|
|
|
@param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest successors.
|
579
|
|
|
|
580
|
|
|
@return (list) Pair of nearest successors represented by list.
|
581
|
|
|
|
582
|
|
|
"""
|
583
|
|
|
|
584
|
|
|
nearest_node1 = None;
|
585
|
|
|
nearest_node2 = None;
|
586
|
|
|
nearest_distance = float("Inf");
|
587
|
|
|
|
588
|
|
|
for i in range(0, len(self.successors)):
|
589
|
|
|
candidate1 = self.successors[i];
|
590
|
|
|
|
591
|
|
|
for j in range(i + 1, len(self.successors)):
|
592
|
|
|
candidate2 = self.successors[j];
|
593
|
|
|
candidate_distance = candidate1.get_distance(candidate2, type_measurement);
|
594
|
|
|
|
595
|
|
|
if (candidate_distance < nearest_distance):
|
596
|
|
|
nearest_distance = candidate_distance;
|
597
|
|
|
nearest_node1 = candidate1;
|
598
|
|
|
nearest_node2 = candidate2;
|
599
|
|
|
|
600
|
|
|
return [nearest_node1, nearest_node2];
|
601
|
|
|
|
602
|
|
|
|
603
|
|
|
class leaf_node(cfnode):
|
604
|
|
|
"""!
|
605
|
|
|
@brief Represents clustering feature leaf node.
|
606
|
|
|
|
607
|
|
|
"""
|
608
|
|
|
|
609
|
|
|
@property
|
610
|
|
|
def entries(self):
|
611
|
|
|
"""!
|
612
|
|
|
@return (list) List of leaf nodes.
|
613
|
|
|
|
614
|
|
|
"""
|
615
|
|
|
return self.__entries;
|
616
|
|
|
|
617
|
|
|
|
618
|
|
|
def __init__(self, feature, parent, entries, payload):
|
619
|
|
|
"""!
|
620
|
|
|
@brief Create CF Leaf node.
|
621
|
|
|
|
622
|
|
|
@param[in] feature (cfentry): Clustering feature of the created node.
|
623
|
|
|
@param[in] parent (non_leaf_node): Parent of the created node.
|
624
|
|
|
@param[in] entries (list): List of entries of the node.
|
625
|
|
|
@param[in] payload (*): Data that is stored by the node.
|
626
|
|
|
|
627
|
|
|
"""
|
628
|
|
|
|
629
|
|
|
super().__init__(feature, parent, payload);
|
630
|
|
|
|
631
|
|
|
## Node type in CF tree that is CFNODE_LEAF for leaf node.
|
632
|
|
|
self.type = cfnode_type.CFNODE_LEAF;
|
633
|
|
|
|
634
|
|
|
self.__entries = entries; # list of clustering features
|
635
|
|
|
|
636
|
|
|
|
637
|
|
|
def __repr__(self):
|
638
|
|
|
"""!
|
639
|
|
|
@return (string) Default leaf node represenation.
|
640
|
|
|
|
641
|
|
|
"""
|
642
|
|
|
text_entries = "\n";
|
643
|
|
|
for entry in self.entries:
|
644
|
|
|
text_entries += "\t" + str(entry) + "\n";
|
645
|
|
|
|
646
|
|
|
return 'Leaf-node %s, parent %s, feature %s, entries: %d %s' % ( hex(id(self)), self.parent, self.feature, len(self.entries), text_entries );
|
647
|
|
|
|
648
|
|
|
|
649
|
|
|
def __str__(self):
|
650
|
|
|
"""!
|
651
|
|
|
@return (string) String leaf node representation.
|
652
|
|
|
|
653
|
|
|
"""
|
654
|
|
|
return self.__repr__();
|
655
|
|
|
|
656
|
|
|
|
657
|
|
|
def insert_entry(self, entry):
|
658
|
|
|
"""!
|
659
|
|
|
@brief Insert new clustering feature to the leaf node.
|
660
|
|
|
|
661
|
|
|
@param[in] entry (cfentry): Clustering feature.
|
662
|
|
|
|
663
|
|
|
"""
|
664
|
|
|
|
665
|
|
|
self.feature += entry;
|
666
|
|
|
self.entries.append(entry);
|
667
|
|
|
|
668
|
|
|
|
669
|
|
|
def remove_entry(self, entry):
|
670
|
|
|
"""!
|
671
|
|
|
@brief Remove clustering feature from the leaf node.
|
672
|
|
|
|
673
|
|
|
@param[in] entry (cfentry): Clustering feature.
|
674
|
|
|
|
675
|
|
|
"""
|
676
|
|
|
|
677
|
|
|
self.feature -= entry;
|
678
|
|
|
self.entries.remove(entry);
|
679
|
|
|
|
680
|
|
|
|
681
|
|
|
def merge(self, node):
|
682
|
|
|
"""!
|
683
|
|
|
@brief Merge leaf node to the current.
|
684
|
|
|
|
685
|
|
|
@param[in] node (leaf_node): Leaf node that should be merged with current.
|
686
|
|
|
|
687
|
|
|
"""
|
688
|
|
|
|
689
|
|
|
self.feature += node.feature;
|
690
|
|
|
|
691
|
|
|
# Move entries from merged node
|
692
|
|
|
for entry in node.entries:
|
693
|
|
|
self.entries.append(entry);
|
694
|
|
|
|
695
|
|
|
|
696
|
|
|
def get_farthest_entries(self, type_measurement):
|
697
|
|
|
"""!
|
698
|
|
|
@brief Find pair of farthest entries of the node.
|
699
|
|
|
|
700
|
|
|
@param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest entries.
|
701
|
|
|
|
702
|
|
|
@return (list) Pair of farthest entries of the node that are represented by list.
|
703
|
|
|
|
704
|
|
|
"""
|
705
|
|
|
|
706
|
|
|
farthest_entity1 = None;
|
707
|
|
|
farthest_entity2 = None;
|
708
|
|
|
farthest_distance = 0;
|
709
|
|
|
|
710
|
|
|
for i in range(0, len(self.entries)):
|
711
|
|
|
candidate1 = self.entries[i];
|
712
|
|
|
|
713
|
|
|
for j in range(i + 1, len(self.entries)):
|
714
|
|
|
candidate2 = self.entries[j];
|
715
|
|
|
candidate_distance = candidate1.get_distance(candidate2, type_measurement);
|
716
|
|
|
|
717
|
|
|
if (candidate_distance > farthest_distance):
|
718
|
|
|
farthest_distance = candidate_distance;
|
719
|
|
|
farthest_entity1 = candidate1;
|
720
|
|
|
farthest_entity2 = candidate2;
|
721
|
|
|
|
722
|
|
|
return [farthest_entity1, farthest_entity2];
|
723
|
|
|
|
724
|
|
|
|
725
|
|
|
def get_nearest_index_entry(self, entry, type_measurement):
|
726
|
|
|
"""!
|
727
|
|
|
@brief Find nearest index of nearest entry of node for the specified entry.
|
728
|
|
|
|
729
|
|
|
@param[in] entry (cfentry): Entry that is used for calculation distance.
|
730
|
|
|
@param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
|
731
|
|
|
|
732
|
|
|
@return (uint) Index of nearest entry of node for the specified entry.
|
733
|
|
|
|
734
|
|
|
"""
|
735
|
|
|
|
736
|
|
|
minimum_distance = float('Inf');
|
737
|
|
|
nearest_index = 0;
|
738
|
|
|
|
739
|
|
|
for candidate_index in range(0, len(self.entries)):
|
740
|
|
|
candidate_distance = self.entries[candidate_index].get_distance(entry, type_measurement);
|
741
|
|
|
if (candidate_distance < minimum_distance):
|
742
|
|
|
nearest_index = candidate_index;
|
743
|
|
|
|
744
|
|
|
return nearest_index;
|
745
|
|
|
|
746
|
|
|
|
747
|
|
|
def get_nearest_entry(self, entry, type_measurement):
|
748
|
|
|
"""!
|
749
|
|
|
@brief Find nearest entry of node for the specified entry.
|
750
|
|
|
|
751
|
|
|
@param[in] entry (cfentry): Entry that is used for calculation distance.
|
752
|
|
|
@param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
|
753
|
|
|
|
754
|
|
|
@return (cfentry) Nearest entry of node for the specified entry.
|
755
|
|
|
|
756
|
|
|
"""
|
757
|
|
|
|
758
|
|
|
min_key = lambda cur_entity: cur_entity.get_distance(entry, type_measurement);
|
759
|
|
|
return min(self.__entries, key = min_key);
|
760
|
|
|
|
761
|
|
|
|
762
|
|
|
class cftree:
|
763
|
|
|
"""!
|
764
|
|
|
@brief CF-Tree representation.
|
765
|
|
|
@details A CF-tree is a height-balanced tree with two parameters: branching factor and threshold.
|
766
|
|
|
|
767
|
|
|
"""
|
768
|
|
|
|
769
|
|
|
@property
|
770
|
|
|
def root(self):
|
771
|
|
|
"""!
|
772
|
|
|
@return (cfnode) Root of the tree.
|
773
|
|
|
|
774
|
|
|
"""
|
775
|
|
|
return self.__root;
|
776
|
|
|
|
777
|
|
|
|
778
|
|
|
@property
|
779
|
|
|
def leafes(self):
|
780
|
|
|
"""!
|
781
|
|
|
@return (list) List of all non-leaf nodes in the tree.
|
782
|
|
|
|
783
|
|
|
"""
|
784
|
|
|
return self.__leafes;
|
785
|
|
|
|
786
|
|
|
|
787
|
|
|
@property
|
788
|
|
|
def amount_nodes(self):
|
789
|
|
|
"""!
|
790
|
|
|
@return (unit) Number of nodes (leaf and non-leaf) in the tree.
|
791
|
|
|
|
792
|
|
|
"""
|
793
|
|
|
return self.__amount_nodes;
|
794
|
|
|
|
795
|
|
|
|
796
|
|
|
@property
|
797
|
|
|
def amount_entries(self):
|
798
|
|
|
"""!
|
799
|
|
|
@return (uint) Number of entries in the tree.
|
800
|
|
|
|
801
|
|
|
"""
|
802
|
|
|
return self.__amount_entries;
|
803
|
|
|
|
804
|
|
|
|
805
|
|
|
@property
|
806
|
|
|
def height(self):
|
807
|
|
|
"""!
|
808
|
|
|
@return (uint) Height of the tree.
|
809
|
|
|
|
810
|
|
|
"""
|
811
|
|
|
return self.__height;
|
812
|
|
|
|
813
|
|
|
|
814
|
|
|
@property
|
815
|
|
|
def branch_factor(self):
|
816
|
|
|
"""!
|
817
|
|
|
@return (uint) Branching factor of the tree.
|
818
|
|
|
@details Branching factor defines maximum number of successors in each non-leaf node.
|
819
|
|
|
|
820
|
|
|
"""
|
821
|
|
|
return self.__branch_factor;
|
822
|
|
|
|
823
|
|
|
|
824
|
|
|
@property
|
825
|
|
|
def threshold(self):
|
826
|
|
|
"""!
|
827
|
|
|
@return (double) Threshold of the tree that represents maximum diameter of sub-clusters that is formed by leaf node entries.
|
828
|
|
|
|
829
|
|
|
"""
|
830
|
|
|
return self.__threshold;
|
831
|
|
|
|
832
|
|
|
|
833
|
|
|
@property
|
834
|
|
|
def max_entries(self):
|
835
|
|
|
"""!
|
836
|
|
|
@return (uint) Maximum number of entries in each leaf node.
|
837
|
|
|
|
838
|
|
|
"""
|
839
|
|
|
return self.__max_entries;
|
840
|
|
|
|
841
|
|
|
|
842
|
|
|
@property
|
843
|
|
|
def type_measurement(self):
|
844
|
|
|
"""!
|
845
|
|
|
@return (measurement_type) Type that is used for measuring.
|
846
|
|
|
|
847
|
|
|
"""
|
848
|
|
|
return self.__type_measurement;
|
849
|
|
|
|
850
|
|
|
|
851
|
|
|
def __init__(self, branch_factor, max_entries, threshold, type_measurement = measurement_type.CENTROID_EUCLIDIAN_DISTANCE):
|
852
|
|
|
"""!
|
853
|
|
|
@brief Create CF-tree.
|
854
|
|
|
|
855
|
|
|
@param[in] branch_factor (uint): Maximum number of children for non-leaf nodes.
|
856
|
|
|
@param[in] max_entries (uint): Maximum number of entries for leaf nodes.
|
857
|
|
|
@param[in] threshold (double): Maximum diameter of feature clustering for each leaf node.
|
858
|
|
|
@param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance metrics.
|
859
|
|
|
|
860
|
|
|
"""
|
861
|
|
|
|
862
|
|
|
self.__root = None;
|
863
|
|
|
|
864
|
|
|
self.__branch_factor = branch_factor; # maximum number of children
|
865
|
|
|
if (self.__branch_factor < 2):
|
866
|
|
|
self.__branch_factor = 2;
|
867
|
|
|
|
868
|
|
|
|
869
|
|
|
self.__threshold = threshold; # maximum diameter of sub-clusters stored at the leaf nodes
|
870
|
|
|
self.__max_entries = max_entries;
|
871
|
|
|
|
872
|
|
|
self.__leafes = [];
|
873
|
|
|
|
874
|
|
|
self.__type_measurement = type_measurement;
|
875
|
|
|
|
876
|
|
|
# statistics
|
877
|
|
|
self.__amount_nodes = 0; # root, despite it can be None.
|
878
|
|
|
self.__amount_entries = 0;
|
879
|
|
|
self.__height = 0; # tree size with root.
|
880
|
|
|
|
881
|
|
|
|
882
|
|
|
def get_level_nodes(self, level):
|
883
|
|
|
"""!
|
884
|
|
|
@brief Traverses CF-tree to obtain nodes at the specified level.
|
885
|
|
|
|
886
|
|
|
@param[in] level (uint): CF-tree level from that nodes should be returned.
|
887
|
|
|
|
888
|
|
|
@return (list) List of CF-nodes that are located on the specified level of the CF-tree.
|
889
|
|
|
|
890
|
|
|
"""
|
891
|
|
|
|
892
|
|
|
level_nodes = [];
|
893
|
|
|
if (level < self.__height):
|
894
|
|
|
level_nodes = self.__recursive_get_level_nodes(level, self.__root);
|
895
|
|
|
|
896
|
|
|
return level_nodes;
|
897
|
|
|
|
898
|
|
|
|
899
|
|
|
def __recursive_get_level_nodes(self, level, node):
|
900
|
|
|
"""!
|
901
|
|
|
@brief Traverses CF-tree to obtain nodes at the specified level recursively.
|
902
|
|
|
|
903
|
|
|
@param[in] level (uint): Current CF-tree level.
|
904
|
|
|
@param[in] node (cfnode): CF-node from that traversing is performed.
|
905
|
|
|
|
906
|
|
|
@return (list) List of CF-nodes that are located on the specified level of the CF-tree.
|
907
|
|
|
|
908
|
|
|
"""
|
909
|
|
|
|
910
|
|
|
level_nodes = [];
|
911
|
|
|
if (level is 0):
|
912
|
|
|
level_nodes.append(node);
|
913
|
|
|
|
914
|
|
|
else:
|
915
|
|
|
for sucessor in node.successors:
|
916
|
|
|
level_nodes += self.__recursive_get_level_nodes(level - 1, sucessor);
|
917
|
|
|
|
918
|
|
|
return level_nodes;
|
919
|
|
|
|
920
|
|
|
|
921
|
|
|
def insert_cluster(self, cluster):
|
922
|
|
|
"""!
|
923
|
|
|
@brief Insert cluster that is represented as list of points where each point is represented by list of coordinates.
|
924
|
|
|
@details Clustering feature is created for that cluster and inserted to the tree.
|
925
|
|
|
|
926
|
|
|
@param[in] cluster (list): Cluster that is represented by list of points that should be inserted to the tree.
|
927
|
|
|
|
928
|
|
|
"""
|
929
|
|
|
|
930
|
|
|
entry = cfentry(len(cluster), linear_sum(cluster), square_sum(cluster));
|
931
|
|
|
self.insert(entry);
|
932
|
|
|
|
933
|
|
|
|
934
|
|
|
def insert(self, entry):
|
935
|
|
|
"""!
|
936
|
|
|
@brief Insert clustering feature to the tree.
|
937
|
|
|
|
938
|
|
|
@param[in] entry (cfentry): Clustering feature that should be inserted.
|
939
|
|
|
|
940
|
|
|
"""
|
941
|
|
|
|
942
|
|
|
if (self.__root is None):
|
943
|
|
|
node = leaf_node(entry, None, [ entry ], None);
|
944
|
|
|
|
945
|
|
|
self.__root = node;
|
946
|
|
|
self.__leafes.append(node);
|
947
|
|
|
|
948
|
|
|
# Update statistics
|
949
|
|
|
self.__amount_entries += 1;
|
950
|
|
|
self.__amount_nodes += 1;
|
951
|
|
|
self.__height += 1; # root has successor now
|
952
|
|
|
else:
|
953
|
|
|
child_node_updation = self.__recursive_insert(entry, self.__root);
|
954
|
|
|
if (child_node_updation is True):
|
955
|
|
|
# Splitting has been finished, check for possibility to merge (at least we have already two children).
|
956
|
|
|
if (self.__merge_nearest_successors(self.__root) is True):
|
957
|
|
|
self.__amount_nodes -= 1;
|
958
|
|
|
|
959
|
|
|
|
960
|
|
|
def find_nearest_leaf(self, entry, search_node = None):
|
961
|
|
|
"""!
|
962
|
|
|
@brief Search nearest leaf to the specified clustering feature.
|
963
|
|
|
|
964
|
|
|
@param[in] entry (cfentry): Clustering feature.
|
965
|
|
|
@param[in] search_node (cfnode): Node from that searching should be started, if None then search process will be started for the root.
|
966
|
|
|
|
967
|
|
|
@return (leaf_node) Nearest node to the specified clustering feature.
|
968
|
|
|
|
969
|
|
|
"""
|
970
|
|
|
|
971
|
|
|
if (search_node is None):
|
972
|
|
|
search_node = self.__root;
|
973
|
|
|
|
974
|
|
|
nearest_node = search_node;
|
975
|
|
|
|
976
|
|
|
if (search_node.type == cfnode_type.CFNODE_NONLEAF):
|
977
|
|
|
min_key = lambda child_node: child_node.feature.get_distance(entry, self.__type_measurement);
|
978
|
|
|
nearest_child_node = min(search_node.successors, key = min_key);
|
979
|
|
|
|
980
|
|
|
nearest_node = self.find_nearest_leaf(entry, nearest_child_node);
|
981
|
|
|
|
982
|
|
|
return nearest_node;
|
983
|
|
|
|
984
|
|
|
|
985
|
|
|
def __recursive_insert(self, entry, search_node):
|
986
|
|
|
"""!
|
987
|
|
|
@brief Recursive insert of the entry to the tree.
|
988
|
|
|
@details It performs all required procedures during insertion such as splitting, merging.
|
989
|
|
|
|
990
|
|
|
@param[in] entry (cfentry): Clustering feature.
|
991
|
|
|
@param[in] search_node (cfnode): Node from that insertion should be started.
|
992
|
|
|
|
993
|
|
|
@return (bool) True if number of nodes at the below level is changed, otherwise False.
|
994
|
|
|
|
995
|
|
|
"""
|
996
|
|
|
|
997
|
|
|
# None-leaf node
|
998
|
|
|
if (search_node.type == cfnode_type.CFNODE_NONLEAF):
|
999
|
|
|
return self.__insert_for_noneleaf_node(entry, search_node);
|
1000
|
|
|
|
1001
|
|
|
# Leaf is reached
|
1002
|
|
|
else:
|
1003
|
|
|
return self.__insert_for_leaf_node(entry, search_node);
|
1004
|
|
|
|
1005
|
|
|
|
1006
|
|
|
def __insert_for_leaf_node(self, entry, search_node):
|
1007
|
|
|
"""!
|
1008
|
|
|
@brief Recursive insert entry from leaf node to the tree.
|
1009
|
|
|
|
1010
|
|
|
@param[in] entry (cfentry): Clustering feature.
|
1011
|
|
|
@param[in] search_node (cfnode): None-leaf node from that insertion should be started.
|
1012
|
|
|
|
1013
|
|
|
@return (bool) True if number of nodes at the below level is changed, otherwise False.
|
1014
|
|
|
|
1015
|
|
|
"""
|
1016
|
|
|
|
1017
|
|
|
node_amount_updation = False;
|
1018
|
|
|
|
1019
|
|
|
# Try to absorb by the entity
|
1020
|
|
|
index_nearest_entry = search_node.get_nearest_index_entry(entry, self.__type_measurement);
|
1021
|
|
|
merged_entry = search_node.entries[index_nearest_entry] + entry;
|
1022
|
|
|
|
1023
|
|
|
# Otherwise try to add new entry
|
1024
|
|
|
if (merged_entry.get_diameter() > self.__threshold):
|
1025
|
|
|
# If it's not exceeded append entity and update feature of the leaf node.
|
1026
|
|
|
search_node.insert_entry(entry);
|
1027
|
|
|
|
1028
|
|
|
# Otherwise current node should be splitted
|
1029
|
|
|
if (len(search_node.entries) > self.__max_entries):
|
1030
|
|
|
self.__split_procedure(search_node);
|
1031
|
|
|
node_amount_updation = True;
|
1032
|
|
|
|
1033
|
|
|
# Update statistics
|
1034
|
|
|
self.__amount_entries += 1;
|
1035
|
|
|
|
1036
|
|
|
else:
|
1037
|
|
|
search_node.entries[index_nearest_entry] = merged_entry;
|
1038
|
|
|
search_node.feature += entry;
|
1039
|
|
|
|
1040
|
|
|
return node_amount_updation;
|
1041
|
|
|
|
1042
|
|
|
|
1043
|
|
|
def __insert_for_noneleaf_node(self, entry, search_node):
|
1044
|
|
|
"""!
|
1045
|
|
|
@brief Recursive insert entry from none-leaf node to the tree.
|
1046
|
|
|
|
1047
|
|
|
@param[in] entry (cfentry): Clustering feature.
|
1048
|
|
|
@param[in] search_node (cfnode): None-leaf node from that insertion should be started.
|
1049
|
|
|
|
1050
|
|
|
@return (bool) True if number of nodes at the below level is changed, otherwise False.
|
1051
|
|
|
|
1052
|
|
|
"""
|
1053
|
|
|
|
1054
|
|
|
node_amount_updation = False;
|
1055
|
|
|
|
1056
|
|
|
min_key = lambda child_node: child_node.get_distance(search_node, self.__type_measurement);
|
1057
|
|
|
nearest_child_node = min(search_node.successors, key = min_key);
|
1058
|
|
|
|
1059
|
|
|
child_node_updation = self.__recursive_insert(entry, nearest_child_node);
|
1060
|
|
|
|
1061
|
|
|
# Update clustering feature of none-leaf node.
|
1062
|
|
|
search_node.feature += entry;
|
1063
|
|
|
|
1064
|
|
|
# Check branch factor, probably some leaf has been splitted and threshold has been exceeded.
|
1065
|
|
|
if (len(search_node.successors) > self.__branch_factor):
|
1066
|
|
|
|
1067
|
|
|
# Check if it's aleady root then new root should be created (height is increased in this case).
|
1068
|
|
|
if (search_node is self.__root):
|
1069
|
|
|
self.__root = non_leaf_node(search_node.feature, None, [ search_node ], None);
|
1070
|
|
|
search_node.parent = self.__root;
|
1071
|
|
|
|
1072
|
|
|
# Update statistics
|
1073
|
|
|
self.__amount_nodes += 1;
|
1074
|
|
|
self.__height += 1;
|
1075
|
|
|
|
1076
|
|
|
[new_node1, new_node2] = self.__split_nonleaf_node(search_node);
|
1077
|
|
|
|
1078
|
|
|
# Update parent list of successors
|
1079
|
|
|
parent = search_node.parent;
|
1080
|
|
|
parent.successors.remove(search_node);
|
1081
|
|
|
parent.successors.append(new_node1);
|
1082
|
|
|
parent.successors.append(new_node2);
|
1083
|
|
|
|
1084
|
|
|
# Update statistics
|
1085
|
|
|
self.__amount_nodes += 1;
|
1086
|
|
|
node_amount_updation = True;
|
1087
|
|
|
|
1088
|
|
|
elif (child_node_updation is True):
|
1089
|
|
|
# Splitting has been finished, check for possibility to merge (at least we have already two children).
|
1090
|
|
|
if (self.__merge_nearest_successors(search_node) is True):
|
1091
|
|
|
self.__amount_nodes -= 1;
|
1092
|
|
|
|
1093
|
|
|
return node_amount_updation;
|
1094
|
|
|
|
1095
|
|
|
|
1096
|
|
|
def __merge_nearest_successors(self, node):
|
1097
|
|
|
"""!
|
1098
|
|
|
@brief Find nearest sucessors and merge them.
|
1099
|
|
|
|
1100
|
|
|
@param[in] node (non_leaf_node): Node whose two nearest successors should be merged.
|
1101
|
|
|
|
1102
|
|
|
@return (bool): True if merging has been successfully performed, otherwise False.
|
1103
|
|
|
|
1104
|
|
|
"""
|
1105
|
|
|
|
1106
|
|
|
merging_result = False;
|
1107
|
|
|
|
1108
|
|
|
if (node.successors[0].type == cfnode_type.CFNODE_NONLEAF):
|
1109
|
|
|
[nearest_child_node1, nearest_child_node2] = node.get_nearest_successors(self.__type_measurement);
|
1110
|
|
|
|
1111
|
|
|
if (len(nearest_child_node1.successors) + len(nearest_child_node2.successors) <= self.__branch_factor):
|
1112
|
|
|
node.successors.remove(nearest_child_node2);
|
1113
|
|
|
if (nearest_child_node2.type == cfnode_type.CFNODE_LEAF):
|
1114
|
|
|
self.__leafes.remove(nearest_child_node2);
|
1115
|
|
|
|
1116
|
|
|
nearest_child_node1.merge(nearest_child_node2);
|
1117
|
|
|
|
1118
|
|
|
merging_result = True;
|
1119
|
|
|
|
1120
|
|
|
return merging_result;
|
1121
|
|
|
|
1122
|
|
|
|
1123
|
|
|
def __split_procedure(self, split_node):
|
1124
|
|
|
"""!
|
1125
|
|
|
@brief Starts node splitting procedure in the CF-tree from the specify node.
|
1126
|
|
|
|
1127
|
|
|
@param[in] split_node (cfnode): CF-tree node that should be splitted.
|
1128
|
|
|
|
1129
|
|
|
"""
|
1130
|
|
|
if (split_node is self.__root):
|
1131
|
|
|
self.__root = non_leaf_node(split_node.feature, None, [ split_node ], None);
|
1132
|
|
|
split_node.parent = self.__root;
|
1133
|
|
|
|
1134
|
|
|
# Update statistics
|
1135
|
|
|
self.__amount_nodes += 1;
|
1136
|
|
|
self.__height += 1;
|
1137
|
|
|
|
1138
|
|
|
[new_node1, new_node2] = self.__split_leaf_node(split_node);
|
1139
|
|
|
|
1140
|
|
|
self.__leafes.remove(split_node);
|
1141
|
|
|
self.__leafes.append(new_node1);
|
1142
|
|
|
self.__leafes.append(new_node2);
|
1143
|
|
|
|
1144
|
|
|
# Update parent list of successors
|
1145
|
|
|
parent = split_node.parent;
|
1146
|
|
|
parent.successors.remove(split_node);
|
1147
|
|
|
parent.successors.append(new_node1);
|
1148
|
|
|
parent.successors.append(new_node2);
|
1149
|
|
|
|
1150
|
|
|
# Update statistics
|
1151
|
|
|
self.__amount_nodes += 1;
|
1152
|
|
|
|
1153
|
|
|
|
1154
|
|
|
def __split_nonleaf_node(self, node):
|
1155
|
|
|
"""!
|
1156
|
|
|
@brief Performs splitting of the specified non-leaf node.
|
1157
|
|
|
|
1158
|
|
|
@param[in] node (non_leaf_node): Non-leaf node that should be splitted.
|
1159
|
|
|
|
1160
|
|
|
@return (list) New pair of non-leaf nodes [non_leaf_node1, non_leaf_node2].
|
1161
|
|
|
|
1162
|
|
|
"""
|
1163
|
|
|
|
1164
|
|
|
[farthest_node1, farthest_node2] = node.get_farthest_successors(self.__type_measurement);
|
1165
|
|
|
|
1166
|
|
|
# create new non-leaf nodes
|
1167
|
|
|
new_node1 = non_leaf_node(farthest_node1.feature, node.parent, [ farthest_node1 ], None);
|
1168
|
|
|
new_node2 = non_leaf_node(farthest_node2.feature, node.parent, [ farthest_node2 ], None);
|
1169
|
|
|
|
1170
|
|
|
farthest_node1.parent = new_node1;
|
1171
|
|
|
farthest_node2.parent = new_node2;
|
1172
|
|
|
|
1173
|
|
|
# re-insert other successors
|
1174
|
|
|
for successor in node.successors:
|
1175
|
|
|
if ( (successor is not farthest_node1) and (successor is not farthest_node2) ):
|
1176
|
|
|
distance1 = new_node1.get_distance(successor, self.__type_measurement);
|
1177
|
|
|
distance2 = new_node2.get_distance(successor, self.__type_measurement);
|
1178
|
|
|
|
1179
|
|
|
if (distance1 < distance2):
|
1180
|
|
|
new_node1.insert_successor(successor);
|
1181
|
|
|
else:
|
1182
|
|
|
new_node2.insert_successor(successor);
|
1183
|
|
|
|
1184
|
|
|
return [new_node1, new_node2];
|
1185
|
|
|
|
1186
|
|
|
|
1187
|
|
|
def __split_leaf_node(self, node):
|
1188
|
|
|
"""!
|
1189
|
|
|
@brief Performs splitting of the specified leaf node.
|
1190
|
|
|
|
1191
|
|
|
@param[in] node (leaf_node): Leaf node that should be splitted.
|
1192
|
|
|
|
1193
|
|
|
@return (list) New pair of leaf nodes [leaf_node1, leaf_node2].
|
1194
|
|
|
|
1195
|
|
|
@warning Splitted node is transformed to non_leaf.
|
1196
|
|
|
|
1197
|
|
|
"""
|
1198
|
|
|
|
1199
|
|
|
# search farthest pair of entries
|
1200
|
|
|
[farthest_entity1, farthest_entity2] = node.get_farthest_entries(self.__type_measurement);
|
1201
|
|
|
|
1202
|
|
|
# create new nodes
|
1203
|
|
|
new_node1 = leaf_node(farthest_entity1, node.parent, [ farthest_entity1 ], None);
|
1204
|
|
|
new_node2 = leaf_node(farthest_entity2, node.parent, [ farthest_entity2 ], None);
|
1205
|
|
|
|
1206
|
|
|
# re-insert other entries
|
1207
|
|
|
for entity in node.entries:
|
1208
|
|
|
if ( (entity is not farthest_entity1) and (entity is not farthest_entity2) ):
|
1209
|
|
|
distance1 = new_node1.feature.get_distance(entity, self.__type_measurement);
|
1210
|
|
|
distance2 = new_node2.feature.get_distance(entity, self.__type_measurement);
|
1211
|
|
|
|
1212
|
|
|
if (distance1 < distance2):
|
1213
|
|
|
new_node1.insert_entry(entity);
|
1214
|
|
|
else:
|
1215
|
|
|
new_node2.insert_entry(entity);
|
1216
|
|
|
|
1217
|
|
|
return [new_node1, new_node2];
|
1218
|
|
|
|
1219
|
|
|
|
1220
|
|
|
def show_feature_destibution(self, data = None):
|
1221
|
|
|
"""!
|
1222
|
|
|
@brief Shows feature distribution.
|
1223
|
|
|
@details Only features in 1D, 2D, 3D space can be visualized.
|
1224
|
|
|
|
1225
|
|
|
@param[in] data (list): List of points that will be used for visualization, if it not specified than feature will be displayed only.
|
1226
|
|
|
|
1227
|
|
|
"""
|
1228
|
|
|
visualizer = cluster_visualizer();
|
1229
|
|
|
|
1230
|
|
|
print("amount of nodes: ", self.__amount_nodes);
|
1231
|
|
|
|
1232
|
|
|
if (data is not None):
|
1233
|
|
|
visualizer.append_cluster(data, marker = 'x');
|
1234
|
|
|
|
1235
|
|
|
for level in range(0, self.height):
|
1236
|
|
|
level_nodes = self.get_level_nodes(level);
|
1237
|
|
|
|
1238
|
|
|
centers = [ node.feature.get_centroid() for node in level_nodes ];
|
1239
|
|
|
visualizer.append_cluster(centers, None, markersize = (self.height - level + 1) * 5);
|
1240
|
|
|
|
1241
|
|
|
visualizer.show(); |
It is generally a bad practice to shadow variables from the outer-scope. In most cases, this is done unintentionally and might lead to unexpected behavior: