Completed
Push — master ( b2f733...eb13df )
by Andrei
02:02
created

cftree.root()   A

Complexity

Conditions 1

Size

Total Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
dl 0
loc 7
rs 9.4285
c 0
b 0
f 0
1
"""!
2
3
@brief Data Structure: CF-Tree
4
@details Based on book description:
5
         - M.Zhang, R.Ramakrishnan, M.Livny. BIRCH: An Efficient Data Clustering Method for Very Large Databases. 1996.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2016
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
from copy import copy;
29
30
from pyclustering.cluster import cluster_visualizer;
31
32
from pyclustering.utils import euclidean_distance_sqrt;
33
from pyclustering.utils import manhattan_distance;
34
from pyclustering.utils import list_math_addition, list_math_subtraction, list_math_multiplication;
35
from pyclustering.utils import linear_sum, square_sum;
36
37
from enum import IntEnum;
38
39
40
class measurement_type(IntEnum):
41
    """!
42
    @brief Enumeration of measurement types for CF-Tree.
43
    
44
    @see cftree
45
    
46
    """
47
    
48
    ## Euclidian distance between centroids of clustering features.
49
    CENTROID_EUCLIDIAN_DISTANCE     = 0;
50
    
51
    ## Manhattan distance between centroids of clustering features.
52
    CENTROID_MANHATTAN_DISTANCE     = 1;
53
    
54
    ## Average distance between all objects from clustering features.
55
    AVERAGE_INTER_CLUSTER_DISTANCE  = 2;
56
    
57
    ## Average distance between all objects within clustering features and between them.
58
    AVERAGE_INTRA_CLUSTER_DISTANCE  = 3;
59
    
60
    ## Variance based distance between clustering features.
61
    VARIANCE_INCREASE_DISTANCE      = 4;
62
63
64
class cfnode_type(IntEnum):
65
    """!
66
    @brief Enumeration of CF-Node types that are used by CF-Tree.
67
    
68
    @see cfnode
69
    @see cftree
70
    
71
    """
72
    
73
    ## Undefined node.
74
    CFNODE_DUMMY    = 0;
75
    
76
    ## Leaf node hasn't got successors, only entries.
77
    CFNODE_LEAF     = 1;
78
    
79
    ## Non-leaf node has got successors and hasn't got entries.
80
    CFNODE_NONLEAF  = 2;
81
82
83
class cftree_insert_status(IntEnum):
84
    """!
85
    
86
    """
87
    CFTREE_INSERT_ABSORB         = 0;
88
    
89
    CFTREE_INSERT_NODE_SPLITTED  = 1;
90
    
91
    CFTREE_INSERT_IGNORED        = 2;
92
93
94
class cfentry:
95
    """!
96
    @brief Clustering feature representation.
97
    
98
    @see cfnode
99
    @see cftree
100
    
101
    """
102
    
103
    @property
104
    def number_points(self):
105
        """!
106
        @brief Returns number of points that are encoded.
107
        
108
        @return (uint) Number of encoded points.
109
        
110
        """
111
        return self.__number_points;
112
    
113
    @property
114
    def linear_sum(self):
115
        """!
116
        @brief Returns linear sum.
117
        
118
        @return (list) Linear sum.
119
        
120
        """
121
        
122
        return self.__linear_sum;
123
    
124
    @property
125
    def square_sum(self):
126
        """!
127
        @brief Returns square sum.
128
        
129
        @return (double) Square sum.
130
        
131
        """
132
        return self.__square_sum;
133
    
134
    
135
    def __init__(self, number_points, linear_sum, square_sum):
0 ignored issues
show
Comprehensibility Bug introduced by
square_sum is re-defining a name which is already available in the outer-scope (previously defined on line 35).

It is generally a bad practice to shadow variables from the outer-scope. In most cases, this is done unintentionally and might lead to unexpected behavior:

param = 5

class Foo:
    def __init__(self, param):   # "param" would be flagged here
        self.param = param
Loading history...
Comprehensibility Bug introduced by
linear_sum is re-defining a name which is already available in the outer-scope (previously defined on line 35).

It is generally a bad practice to shadow variables from the outer-scope. In most cases, this is done unintentionally and might lead to unexpected behavior:

param = 5

class Foo:
    def __init__(self, param):   # "param" would be flagged here
        self.param = param
Loading history...
136
        """!
137
        @brief CF-entry constructor.
138
        
139
        @param[in] number_points (uint): Number of objects that is represented by the entry.
140
        @param[in] linear_sum (list): Linear sum of values that represent objects in each dimension.
141
        @param[in] square_sum (double): Square sum of values that represent objects.
142
        
143
        """
144
        
145
        self.__number_points = number_points;
146
        self.__linear_sum = linear_sum;
147
        self.__square_sum = square_sum;
148
        
149
        self.__centroid = None;
150
        self.__radius = None;
151
        self.__diameter = None;
152
    
153
    
154
    def __copy__(self):
155
        """!
156
        @returns (cfentry) Makes copy of the cfentry instance.
157
        
158
        """
159
        return cfentry(self.__number_points, self.__linear_sum, self.__square_sum);
160
    
161
    
162
    def __repr__(self):
163
        """!
164
        @return (string) Default cfentry representation.
165
        
166
        """
167
        return 'CF (%s, %s, %0.2f) [%s]' % ( self.number_points, self.linear_sum, self.__square_sum, hex(id(self)) );    
168
    
169
    
170
    def __str__(self):
171
        """!
172
        @brief Default cfentry string representation.
173
        
174
        """
175
        return self.__repr__();
176
    
177
    
178
    def __add__(self, entry):
179
        """!
180
        @brief Overloaded operator add. Performs addition of two clustering features.
181
        
182
        @param[in] entry (cfentry): Entry that is added to the current.
183
        
184
        @return (cfentry) Result of addition of two clustering features.
185
        
186
        """
187
        
188
        number_points = self.number_points + entry.number_points;
189
        result_linear_sum = list_math_addition(self.linear_sum, entry.linear_sum);
190
        result_square_sum = self.square_sum + entry.square_sum;  
191
        
192
        return cfentry(number_points, result_linear_sum, result_square_sum);
193
    
194
    
195
    def __sub__(self, entry):
196
        """!
197
        @brief Overloaded operator sub. Performs substraction of two clustering features.
198
        @details Substraction can't be performed with clustering feature whose description is less then substractor.
199
        
200
        @param[in] entry (cfentry): Entry that is substracted from the current.
201
        
202
        @return (cfentry) Result of substraction of two clustering features.
203
        
204
        """
205
                
206
        number_points = self.number_points - entry.number_points;
207
        result_linear_sum = list_math_subtraction(self.linear_sum, entry.linear_sum);
208
        result_square_sum = self.square_sum - entry.square_sum;
209
        
210
        if ( (number_points < 0) or (result_square_sum < 0) ):
211
            raise NameError("Substruction with negative result is not possible for clustering features.");
212
        
213
        return cfentry(number_points, result_linear_sum, result_square_sum);
214
    
215
    
216
    def __eq__(self, entry):
217
        """!
218
        @brief Overloaded operator eq. 
219
        @details Performs comparison of two clustering features.
220
        
221
        @param[in] entry (cfentry): Entry that is used for comparison with current.
222
        
223
        @return (bool) True is both clustering features are equals in line with tolerance, otherwise False.
224
        
225
        """
226
                
227
        tolerance = 0.00001;
228
        
229
        result = (self.number_points == entry.number_points);
230
        result &= ( (self.square_sum + tolerance > entry.square_sum) and (self.square_sum - tolerance < entry.square_sum) );
231
        
232
        for index_dimension in range(0, len(self.linear_sum)):
233
            result &= ( (self.linear_sum[index_dimension] + tolerance > entry.linear_sum[index_dimension]) and (self.linear_sum[index_dimension] - tolerance < entry.linear_sum[index_dimension]) );
234
        
235
        return result;
236
    
237
    
238
    def get_distance(self, entry, type_measurement):
239
        """!
240
        @brief Calculates distance between two clusters in line with measurement type.
241
        
242
        @details In case of usage CENTROID_EUCLIDIAN_DISTANCE square euclidian distance will be returned.
243
                 Square root should be taken from the result for obtaining real euclidian distance between
244
                 entries. 
245
        
246
        @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
247
        @param[in] type_measurement (measurement_type): Distance measurement algorithm between two clusters.
248
        
249
        @return (double) Distance between two clusters.
250
        
251
        """
252
        
253
        if (type_measurement is measurement_type.CENTROID_EUCLIDIAN_DISTANCE):
254
            return euclidean_distance_sqrt(entry.get_centroid(), self.get_centroid());
255
        
256
        elif (type_measurement is measurement_type.CENTROID_MANHATTAN_DISTANCE):
257
            return manhattan_distance(entry.get_centroid(), self.get_centroid());
258
        
259
        elif (type_measurement is measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE):
260
            return self.__get_average_inter_cluster_distance(entry);
261
            
262
        elif (type_measurement is measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE):
263
            return self.__get_average_intra_cluster_distance(entry);
264
        
265
        elif (type_measurement is measurement_type.VARIANCE_INCREASE_DISTANCE):
266
            return self.__get_variance_increase_distance(entry);
267
        
268
        else:
269
            assert 0;
270
    
271
        
272
    def get_centroid(self):
273
        """!
274
        @brief Calculates centroid of cluster that is represented by the entry. 
275
        @details It's calculated once when it's requested after the last changes.
276
        
277
        @return (list) Centroid of cluster that is represented by the entry.
278
        
279
        """
280
        
281
        if (self.__centroid is not None):
282
            return self.__centroid;
283
        
284
        self.__centroid = [0] * len(self.linear_sum);
285
        for index_dimension in range(0, len(self.linear_sum)):
286
            self.__centroid[index_dimension] = self.linear_sum[index_dimension] / self.number_points;
287
        
288
        return self.__centroid;
289
    
290
    
291
    def get_radius(self):
292
        """!
293
        @brief Calculates radius of cluster that is represented by the entry.
294
        @details It's calculated once when it's requested after the last changes.
295
        
296
        @return (double) Radius of cluster that is represented by the entry.
297
        
298
        """
299
        
300
        if (self.__radius is not None):
301
            return self.__radius;
302
        
303
        centroid = self.get_centroid();
304
        
305
        radius_part_1 = self.square_sum;
306
        
307
        radius_part_2 = 0.0;
308
        radius_part_3 = 0.0;
309
        
310
        if (type(centroid) == list):
311
            radius_part_2 = 2.0 * sum(list_math_multiplication(self.linear_sum, centroid));
312
            radius_part_3 = self.number_points * sum(list_math_multiplication(centroid, centroid));
313
        else:
314
            radius_part_2 = 2.0 * self.linear_sum * centroid;
315
            radius_part_3 = self.number_points * centroid * centroid;
316
        
317
        self.__radius = ( (1.0 / self.number_points) * (radius_part_1 - radius_part_2 + radius_part_3) ) ** 0.5;
318
        return self.__radius;
319
        
320
    
321
    def get_diameter(self):
322
        """!
323
        @brief Calculates diameter of cluster that is represented by the entry.
324
        @details It's calculated once when it's requested after the last changes.
325
        
326
        @return (double) Diameter of cluster that is represented by the entry.
327
        
328
        """
329
        
330
        if (self.__diameter is not None):
331
            return self.__diameter;
332
        
333
        diameter_part = 0.0;
334
        if (type(self.linear_sum) == list):
335
            diameter_part = self.square_sum * self.number_points - 2.0 * sum(list_math_multiplication(self.linear_sum, self.linear_sum)) + self.square_sum * self.number_points;
336
        else:
337
            diameter_part = self.square_sum * self.number_points - 2.0 * self.linear_sum * self.linear_sum + self.square_sum * self.number_points;
338
            
339
        self.__diameter = ( diameter_part / (self.number_points * (self.number_points - 1)) ) ** 0.5;
340
        return self.__diameter;
341
    
342
        
343
    def __get_average_inter_cluster_distance(self, entry):
344
        """!
345
        @brief Calculates average inter cluster distance between current and specified clusters.
346
        
347
        @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
348
        
349
        @return (double) Average inter cluster distance.
350
        
351
        """
352
        
353
        linear_part_distance = sum(list_math_multiplication(self.linear_sum, entry.linear_sum));
354
        
355
        return ( (entry.number_points * self.square_sum - 2.0 * linear_part_distance + self.number_points * entry.square_sum) / (self.number_points * entry.number_points) ) ** 0.5;
356
    
357
    
358
    def __get_average_intra_cluster_distance(self, entry):
359
        """!
360
        @brief Calculates average intra cluster distance between current and specified clusters.
361
        
362
        @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
363
        
364
        @return (double) Average intra cluster distance.
365
        
366
        """
367
        
368
        linear_part_first = list_math_addition(self.linear_sum, entry.linear_sum);
369
        linear_part_second = linear_part_first;
370
        
371
        linear_part_distance = sum(list_math_multiplication(linear_part_first, linear_part_second));
372
        
373
        general_part_distance = 2.0 * (self.number_points + entry.number_points) * (self.square_sum + entry.square_sum) - 2.0 * linear_part_distance;
374
        
375
        return (general_part_distance / ( (self.number_points + entry.number_points) * (self.number_points + entry.number_points - 1.0) )) ** 0.5;
376
    
377
    
378
    def __get_variance_increase_distance(self, entry):
379
        """!
380
        @brief Calculates variance increase distance between current and specified clusters.
381
        
382
        @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
383
        
384
        @return (double) Variance increase distance.
385
        
386
        """
387
                
388
        linear_part_12 = list_math_addition(self.linear_sum, entry.linear_sum);
389
        variance_part_first = (self.square_sum + entry.square_sum) - \
390
            2.0 * sum(list_math_multiplication(linear_part_12, linear_part_12)) / (self.number_points + entry.number_points) + \
391
            (self.number_points + entry.number_points) * sum(list_math_multiplication(linear_part_12, linear_part_12)) / (self.number_points + entry.number_points)**2.0;
392
393
        
394
        linear_part_11 = sum(list_math_multiplication(self.linear_sum, self.linear_sum));
395
        variance_part_second = -( self.square_sum - (2.0 * linear_part_11 / self.number_points) + (linear_part_11 / self.number_points) );
396
        
397
        linear_part_22 = sum(list_math_multiplication(entry.linear_sum, entry.linear_sum));
398
        variance_part_third = -( entry.square_sum - (2.0 / entry.number_points) * linear_part_22 + entry.number_points * (1.0 / entry.number_points ** 2.0) * linear_part_22 );
399
400
        return (variance_part_first + variance_part_second + variance_part_third);
401
        
402
403
class cfnode:
404
    """!
405
    @brief Representation of node of CF-Tree.
406
    
407
    """
408
    
409
    def __init__(self, feature, parent, payload):
410
        """!
411
        @brief Constructor of abstract CF node.
412
        
413
        @param[in] feature (cfentry): Clustering feature of the created node.
414
        @param[in] parent (cfnode): Parent of the created node.
415
        @param[in] payload (*): Data that is stored by the node.
416
        
417
        """
418
        
419
        ## Clustering feature of the node.
420
        self.feature = copy(feature);
421
        
422
        ## Pointer to the parent node (None for root).
423
        self.parent = parent;
424
        
425
        ## Type node (leaf or non-leaf).
426
        self.type = cfnode_type.CFNODE_DUMMY;
427
        
428
        ## Payload of node where user data can be stored.
429
        self.payload = payload;
430
    
431
    
432
    def __repr__(self):
433
        """!
434
        @return (string) Default representation of CF node.
435
        
436
        """
437
        
438
        return 'CF node %s, parent %s, feature %s' % ( hex(id(self)), self.parent, self.feature );
439
440
441
    def __str__(self):
442
        """!
443
        @return (string) String representation of CF node.
444
        
445
        """
446
        return self.__repr__();
447
    
448
    
449
    def get_distance(self, node, type_measurement):
450
        """!
451
        @brief Calculates distance between nodes in line with specified type measurement.
452
        
453
        @param[in] node (cfnode): CF-node that is used for calculation distance to the current node.
454
        @param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance.
455
        
456
        @return (double) Distance between two nodes.
457
        
458
        """
459
        
460
        return self.feature.get_distance(node.feature, type_measurement);
461
    
462
463
class non_leaf_node(cfnode):
464
    """!
465
    @brief Representation of clustering feature non-leaf node.
466
    
467
    """ 
468
    
469
    @property
470
    def successors(self):
471
        """!
472
        @return (list) List of successors of the node.
473
        
474
        """
475
        return self.__successors;
476
    
477
    
478
    def __init__(self, feature, parent, successors, payload):
479
        """!
480
        @brief Create CF Non-leaf node.
481
        
482
        @param[in] feature (cfentry): Clustering feature of the created node.
483
        @param[in] parent (non_leaf_node): Parent of the created node.
484
        @param[in] successors (list): List of successors of the node.
485
        @param[in] payload (*): Data that is stored by the node.
486
        
487
        """
488
                
489
        super().__init__(feature, parent, payload);
490
        
491
        ## Node type in CF tree that is CFNODE_NONLEAF for non leaf node.
492
        self.type = cfnode_type.CFNODE_NONLEAF;
493
        
494
        self.__successors = successors;
495
    
496
    
497
    def __repr__(self):
498
        """!
499
        @return (string) Representation of non-leaf node representation.
500
        
501
        """   
502
        return 'Non-leaf node %s, parent %s, feature %s, successors: %d' % ( hex(id(self)), self.parent, self.feature, len(self.successors) );
503
    
504
    
505
    def __str__(self):
506
        """!
507
        @return (string) String non-leaf representation.
508
        
509
        """
510
        return self.__repr__();
511
    
512
    
513
    def insert_successor(self, successor):
514
        """!
515
        @brief Insert successor to the node.
516
        
517
        @param[in] successor (cfnode): Successor for adding.
518
        
519
        """
520
        
521
        self.feature += successor.feature;
522
        self.successors.append(successor);
523
        
524
        successor.parent = self;
525
    
526
    
527
    def remove_successor(self, successor):
528
        """!
529
        @brief Remove successor from the node.
530
        
531
        @param[in] successor (cfnode): Successor for removing.
532
        
533
        """
534
        
535
        self.feature -= successor.feature;
536
        self.successors.append(successor);
537
        
538
        successor.parent = self;
539
    
540
    
541
    def merge(self, node):
542
        """!
543
        @brief Merge non-leaf node to the current.
544
        
545
        @param[in] node (non_leaf_node): Non-leaf node that should be merged with current.
546
        
547
        """
548
                
549
        self.feature += node.feature;
550
        
551
        for child in node.successors:
552
            child.parent = self;
553
            self.successors.append(child);
554
    
555
    
556
    def get_farthest_successors(self, type_measurement):
557
        """!
558
        @brief Find pair of farthest successors of the node in line with measurement type.
559
        
560
        @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest successors.
561
        
562
        @return (list) Pair of farthest successors represented by list [cfnode1, cfnode2].
563
        
564
        """
565
        
566
        farthest_node1 = None;
567
        farthest_node2 = None;
568
        farthest_distance = 0;
569
        
570 View Code Duplication
        for i in range(0, len(self.successors)):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
571
            candidate1 = self.successors[i];
572
            
573
            for j in range(i + 1, len(self.successors)):
574
                candidate2 = self.successors[j];
575
                candidate_distance = candidate1.get_distance(candidate2, type_measurement);
576
                
577
                if (candidate_distance > farthest_distance):
578
                    farthest_distance = candidate_distance;
579
                    farthest_node1 = candidate1;
580
                    farthest_node2 = candidate2;        
581
        
582
                    return [farthest_node1, farthest_node2];
583
    
584
    
585
    def get_nearest_successors(self, type_measurement):
586
        """!
587
        @brief Find pair of nearest successors of the node in line with measurement type.
588
        
589
        @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest successors.
590
        
591
        @return (list) Pair of nearest successors represented by list.
592
        
593
        """
594
                
595
        nearest_node1 = None;
596
        nearest_node2 = None;
597
        nearest_distance = float("Inf");
598
        
599
        for i in range(0, len(self.successors)):
600
            candidate1 = self.successors[i];
601
            
602
            for j in range(i + 1, len(self.successors)):
603
                candidate2 = self.successors[j];
604
                candidate_distance = candidate1.get_distance(candidate2, type_measurement);
605
                
606
                if (candidate_distance < nearest_distance):
607
                    nearest_distance = candidate_distance;
608
                    nearest_node1 = candidate1;
609
                    nearest_node2 = candidate2;
610
        
611
        return [nearest_node1, nearest_node2];
612
613
614
class leaf_node(cfnode):
615
    """!
616
    @brief Represents clustering feature leaf node.
617
    
618
    """
619
    
620
    @property
621
    def entries(self):
622
        """!
623
        @return (list) List of leaf nodes.
624
        
625
        """
626
        return self.__entries;
627
    
628
    
629
    def __init__(self, feature, parent, entries, payload):
630
        """!
631
        @brief Create CF Leaf node.
632
        
633
        @param[in] feature (cfentry): Clustering feature of the created node.
634
        @param[in] parent (non_leaf_node): Parent of the created node.
635
        @param[in] entries (list): List of entries of the node.
636
        @param[in] payload (*): Data that is stored by the node.
637
        
638
        """
639
        
640
        super().__init__(feature, parent, payload);
641
        
642
        ## Node type in CF tree that is CFNODE_LEAF for leaf node.
643
        self.type = cfnode_type.CFNODE_LEAF;
644
        
645
        self.__entries = entries;   # list of clustering features
646
        
647
    
648
    def __repr__(self):
649
        """!
650
        @return (string) Default leaf node represenation.
651
        
652
        """
653
        text_entries = "\n";
654
        for entry in self.entries:
655
            text_entries += "\t" + str(entry) + "\n";
656
        
657
        return 'Leaf-node %s, parent %s, feature %s, entries: %d %s' % ( hex(id(self)), self.parent, self.feature, len(self.entries), text_entries );
658
    
659
    
660
    def __str__(self):
661
        """!
662
        @return (string) String leaf node representation.
663
        
664
        """
665
        return self.__repr__();
666
    
667
    
668
    def insert_entry(self, entry):
669
        """!
670
        @brief Insert new clustering feature to the leaf node.
671
        
672
        @param[in] entry (cfentry): Clustering feature.
673
        
674
        """
675
                              
676
        self.feature += entry;
677
        self.entries.append(entry);
678
        
679
    
680
    def remove_entry(self, entry):
681
        """!
682
        @brief Remove clustering feature from the leaf node.
683
        
684
        @param[in] entry (cfentry): Clustering feature.
685
        
686
        """
687
                
688
        self.feature -= entry;
689
        self.entries.remove(entry);
690
    
691
    
692
    def merge(self, node):
693
        """!
694
        @brief Merge leaf node to the current.
695
        
696
        @param[in] node (leaf_node): Leaf node that should be merged with current.
697
        
698
        """
699
        
700
        self.feature += node.feature;
701
        
702
        # Move entries from merged node
703
        for entry in node.entries:
704
            self.entries.append(entry);
705
            
706
    
707
    def get_farthest_entries(self, type_measurement):
708
        """!
709
        @brief Find pair of farthest entries of the node.
710
        
711
        @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest entries.
712
        
713
        @return (list) Pair of farthest entries of the node that are represented by list.
714
        
715
        """
716
        
717
        farthest_entity1 = None;
718
        farthest_entity2 = None;
719
        farthest_distance = 0;
720
        
721
        for i in range(0, len(self.entries)):
722
            candidate1 = self.entries[i];
723
            
724
            for j in range(i + 1, len(self.entries)):
725
                candidate2 = self.entries[j];
726
                candidate_distance = candidate1.get_distance(candidate2, type_measurement);
727
                
728
                if (candidate_distance > farthest_distance):
729
                    farthest_distance = candidate_distance;
730
                    farthest_entity1 = candidate1;
731
                    farthest_entity2 = candidate2;
732
        
733
        return [farthest_entity1, farthest_entity2];
734
    
735
    
736
    def get_nearest_index_entry(self, entry, type_measurement):
737
        """!
738
        @brief Find nearest index of nearest entry of node for the specified entry.
739
        
740
        @param[in] entry (cfentry): Entry that is used for calculation distance.
741
        @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
742
        
743
        @return (uint) Index of nearest entry of node for the specified entry.
744
        
745
        """
746
        
747
        minimum_distance = float('Inf');
748
        nearest_index = 0;
749
        
750
        for candidate_index in range(0, len(self.entries)):
751
            candidate_distance = self.entries[candidate_index].get_distance(entry, type_measurement);
752
            if (candidate_distance < minimum_distance):
753
                nearest_index = candidate_index;
754
        
755
        return nearest_index;
756
    
757
    
758
    def get_nearest_entry(self, entry, type_measurement):
759
        """!
760
        @brief Find nearest entry of node for the specified entry.
761
        
762
        @param[in] entry (cfentry): Entry that is used for calculation distance.
763
        @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
764
        
765
        @return (cfentry) Nearest entry of node for the specified entry.
766
        
767
        """
768
        
769
        min_key = lambda cur_entity: cur_entity.get_distance(entry, type_measurement);
770
        return min(self.__entries, key = min_key);
771
772
773
class cftree:
774
    """!
775
    @brief CF-Tree representation.
776
    @details A CF-tree is a height-balanced tree with two parameters: branching factor and threshold.
777
    
778
    """
779
    
780
    @property
781
    def root(self):
782
        """!
783
        @return (cfnode) Root of the tree.
784
        
785
        """
786
        return self.__root;
787
    
788
    
789
    @property
790
    def leafes(self):
791
        """!
792
        @return (list) List of all non-leaf nodes in the tree.
793
        
794
        """
795
        return self.__leafes;
796
    
797
    
798
    @property
799
    def amount_nodes(self):
800
        """!
801
        @return (unit) Number of nodes (leaf and non-leaf) in the tree.
802
        
803
        """
804
        return self.__amount_nodes;
805
    
806
    
807
    @property
808
    def amount_entries(self):
809
        """!
810
        @return (uint) Number of entries in the tree.
811
        
812
        """
813
        return self.__amount_entries;
814
    
815
    
816
    @property
817
    def height(self):
818
        """!
819
        @return (uint) Height of the tree.
820
        
821
        """
822
        return self.__height;
823
    
824
    
825
    @property
826
    def branch_factor(self):
827
        """!
828
        @return (uint) Branching factor of the tree.
829
        @details Branching factor defines maximum number of successors in each non-leaf node.
830
        
831
        """
832
        return self.__branch_factor;
833
    
834
    
835
    @property
836
    def threshold(self):
837
        """!
838
        @return (double) Threshold of the tree that represents maximum diameter of sub-clusters that is formed by leaf node entries.
839
        
840
        """
841
        return self.__threshold;
842
    
843
    
844
    @property
845
    def max_entries(self):
846
        """!
847
        @return (uint) Maximum number of entries in each leaf node.
848
        
849
        """
850
        return self.__max_entries;
851
    
852
    
853
    @property
854
    def type_measurement(self):
855
        """!
856
        @return (measurement_type) Type that is used for measuring.
857
        
858
        """
859
        return self.__type_measurement;
860
    
861
    
862
    def __init__(self, branch_factor, max_entries, threshold, type_measurement = measurement_type.CENTROID_EUCLIDIAN_DISTANCE):
863
        """!
864
        @brief Create CF-tree.
865
        
866
        @param[in] branch_factor (uint): Maximum number of children for non-leaf nodes.
867
        @param[in] max_entries (uint): Maximum number of entries for leaf nodes.
868
        @param[in] threshold (double): Maximum diameter of feature clustering for each leaf node.
869
        @param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance metrics.
870
        
871
        """
872
873
        self.__root = None;
874
875
        self.__branch_factor = branch_factor; # maximum number of children
876
        if (self.__branch_factor < 2):
877
            self.__branch_factor = 2;
878
        
879
        
880
        self.__threshold = threshold;         # maximum diameter of sub-clusters stored at the leaf nodes
881
        self.__max_entries = max_entries;
882
        
883
        self.__leafes = [];
884
        
885
        self.__type_measurement = type_measurement;
886
        
887
        # statistics
888
        self.__amount_nodes = 0;    # root, despite it can be None.
889
        self.__amount_entries = 0;
890
        self.__height = 0;          # tree size with root.
891
    
892
    
893
    def get_level_nodes(self, level):
894
        """!
895
        @brief Traverses CF-tree to obtain nodes at the specified level.
896
        
897
        @param[in] level (uint): CF-tree level from that nodes should be returned.
898
        
899
        @return (list) List of CF-nodes that are located on the specified level of the CF-tree.
900
        
901
        """
902
        
903
        level_nodes = [];
904
        if (level < self.__height):
905
            level_nodes = self.__recursive_get_level_nodes(level, self.__root);
906
        
907
        return level_nodes;
908
    
909
    
910
    def __recursive_get_level_nodes(self, level, node):
911
        """!
912
        @brief Traverses CF-tree to obtain nodes at the specified level recursively.
913
        
914
        @param[in] level (uint): Current CF-tree level.
915
        @param[in] node (cfnode): CF-node from that traversing is performed.
916
        
917
        @return (list) List of CF-nodes that are located on the specified level of the CF-tree.
918
        
919
        """
920
        
921
        level_nodes = [];
922
        if (level is 0):
923
            level_nodes.append(node);
924
        
925
        else:
926
            for sucessor in node.successors:
927
                level_nodes += self.__recursive_get_level_nodes(level - 1, sucessor);
928
        
929
        return level_nodes;
930
    
931
    
932
    def insert_cluster(self, cluster):
933
        """!
934
        @brief Insert cluster that is represented as list of points where each point is represented by list of coordinates.
935
        @details Clustering feature is created for that cluster and inserted to the tree.
936
        
937
        @param[in] cluster (list): Cluster that is represented by list of points that should be inserted to the tree.
938
        
939
        """
940
        
941
        entry = cfentry(len(cluster), linear_sum(cluster), square_sum(cluster));
942
        self.insert(entry);
943
    
944
    
945
    def insert(self, entry):
946
        """!
947
        @brief Insert clustering feature to the tree.
948
        
949
        @param[in] entry (cfentry): Clustering feature that should be inserted.
950
        
951
        """
952
                
953
        if (self.__root is None):
954
            node = leaf_node(entry, None, [ entry ], None);
955
            
956
            self.__root = node;
957
            self.__leafes.append(node);
958
            
959
            # Update statistics
960
            self.__amount_entries += 1;
961
            self.__amount_nodes += 1;
962
            self.__height += 1;             # root has successor now
963
        else:
964
            child_node_updation = self.__recursive_insert(entry, self.__root);
965
            if (child_node_updation is True):
966
                # Splitting has been finished, check for possibility to merge (at least we have already two children).
967
                if (self.__merge_nearest_successors(self.__root) is True):
968
                    self.__amount_nodes -= 1;
969
    
970
    
971
    def find_nearest_leaf(self, entry, search_node = None):
972
        """!
973
        @brief Search nearest leaf to the specified clustering feature.
974
        
975
        @param[in] entry (cfentry): Clustering feature.
976
        @param[in] search_node (cfnode): Node from that searching should be started, if None then search process will be started for the root.
977
        
978
        @return (leaf_node) Nearest node to the specified clustering feature.
979
        
980
        """
981
        
982
        if (search_node is None):
983
            search_node = self.__root;
984
        
985
        nearest_node = search_node;
986
        
987
        if (search_node.type == cfnode_type.CFNODE_NONLEAF):
988
            min_key = lambda child_node: child_node.feature.get_distance(entry, self.__type_measurement);
989
            nearest_child_node = min(search_node.successors, key = min_key);
990
            
991
            nearest_node = self.find_nearest_leaf(entry, nearest_child_node);
992
        
993
        return nearest_node;
994
    
995
    
996
    def __recursive_insert(self, entry, search_node):
997
        """!
998
        @brief Recursive insert of the entry to the tree.
999
        @details It performs all required procedures during insertion such as splitting, merging.
1000
        
1001
        @param[in] entry (cfentry): Clustering feature.
1002
        @param[in] search_node (cfnode): Node from that insertion should be started.
1003
        
1004
        @return (bool) True if number of nodes at the below level is changed, otherwise False.
1005
        
1006
        """
1007
        
1008
        # None-leaf node
1009
        if (search_node.type == cfnode_type.CFNODE_NONLEAF):
1010
            return self.__insert_for_noneleaf_node(entry, search_node);
1011
        
1012
        # Leaf is reached 
1013
        else:
1014
            return self.__insert_for_leaf_node(entry, search_node);
1015
    
1016
    
1017
    def __insert_for_leaf_node(self, entry, search_node):
1018
        """!
1019
        @brief Recursive insert entry from leaf node to the tree.
1020
        
1021
        @param[in] entry (cfentry): Clustering feature.
1022
        @param[in] search_node (cfnode): None-leaf node from that insertion should be started.
1023
        
1024
        @return (bool) True if number of nodes at the below level is changed, otherwise False.
1025
        
1026
        """
1027
        
1028
        node_amount_updation = False;
1029
        
1030
        # Try to absorb by the entity
1031
        index_nearest_entry = search_node.get_nearest_index_entry(entry, self.__type_measurement);
1032
        merged_entry = search_node.entries[index_nearest_entry] + entry;
1033
        
1034
        # Otherwise try to add new entry
1035
        if (merged_entry.get_diameter() > self.__threshold):
1036
            # If it's not exceeded append entity and update feature of the leaf node.
1037
            search_node.insert_entry(entry);
1038
            
1039
            # Otherwise current node should be splitted
1040
            if (len(search_node.entries) > self.__max_entries):
1041
                self.__split_procedure(search_node);
1042
                node_amount_updation = True;
1043
            
1044
            # Update statistics
1045
            self.__amount_entries += 1;
1046
            
1047
        else:
1048
            search_node.entries[index_nearest_entry] = merged_entry;
1049
            search_node.feature += entry;
1050
        
1051
        return node_amount_updation;
1052
    
1053
    
1054
    def __insert_for_noneleaf_node(self, entry, search_node):
1055
        """!
1056
        @brief Recursive insert entry from none-leaf node to the tree.
1057
        
1058
        @param[in] entry (cfentry): Clustering feature.
1059
        @param[in] search_node (cfnode): None-leaf node from that insertion should be started.
1060
        
1061
        @return (bool) True if number of nodes at the below level is changed, otherwise False.
1062
        
1063
        """
1064
        
1065
        node_amount_updation = False;
1066
        
1067
        min_key = lambda child_node: child_node.get_distance(search_node, self.__type_measurement);
1068
        nearest_child_node = min(search_node.successors, key = min_key);
1069
        
1070
        child_node_updation = self.__recursive_insert(entry, nearest_child_node);
1071
        
1072
        # Update clustering feature of none-leaf node.
1073
        search_node.feature += entry;
1074
            
1075
        # Check branch factor, probably some leaf has been splitted and threshold has been exceeded.
1076
        if (len(search_node.successors) > self.__branch_factor):
1077
            
1078
            # Check if it's aleady root then new root should be created (height is increased in this case).
1079
            if (search_node is self.__root):
1080
                self.__root = non_leaf_node(search_node.feature, None, [ search_node ], None);
1081
                search_node.parent = self.__root;
1082
                
1083
                # Update statistics
1084
                self.__amount_nodes += 1;
1085
                self.__height += 1;
1086
                
1087
            [new_node1, new_node2] = self.__split_nonleaf_node(search_node);
1088
            
1089
            # Update parent list of successors
1090
            parent = search_node.parent;
1091
            parent.successors.remove(search_node);
1092
            parent.successors.append(new_node1);
1093
            parent.successors.append(new_node2);
1094
            
1095
            # Update statistics
1096
            self.__amount_nodes += 1;
1097
            node_amount_updation = True;
1098
            
1099
        elif (child_node_updation is True):
1100
            # Splitting has been finished, check for possibility to merge (at least we have already two children).
1101
            if (self.__merge_nearest_successors(search_node) is True):
1102
                self.__amount_nodes -= 1;
1103
        
1104
        return node_amount_updation;
1105
    
1106
    
1107
    def __merge_nearest_successors(self, node):
1108
        """!
1109
        @brief Find nearest sucessors and merge them.
1110
        
1111
        @param[in] node (non_leaf_node): Node whose two nearest successors should be merged.
1112
        
1113
        @return (bool): True if merging has been successfully performed, otherwise False.
1114
        
1115
        """
1116
        
1117
        merging_result = False;
1118
        
1119
        if (node.successors[0].type == cfnode_type.CFNODE_NONLEAF):
1120
            [nearest_child_node1, nearest_child_node2] = node.get_nearest_successors(self.__type_measurement);
1121
            
1122
            if (len(nearest_child_node1.successors) + len(nearest_child_node2.successors) <= self.__branch_factor):
1123
                node.successors.remove(nearest_child_node2);
1124
                if (nearest_child_node2.type == cfnode_type.CFNODE_LEAF):
1125
                    self.__leafes.remove(nearest_child_node2);
1126
                
1127
                nearest_child_node1.merge(nearest_child_node2);
1128
                
1129
                merging_result = True;
1130
        
1131
        return merging_result;
1132
    
1133
    
1134
    def __split_procedure(self, split_node):
1135
        """!
1136
        @brief Starts node splitting procedure in the CF-tree from the specify node.
1137
        
1138
        @param[in] split_node (cfnode): CF-tree node that should be splitted.
1139
        
1140
        """
1141
        if (split_node is self.__root):
1142
            self.__root = non_leaf_node(split_node.feature, None, [ split_node ], None);
1143
            split_node.parent = self.__root;
1144
            
1145
            # Update statistics
1146
            self.__amount_nodes += 1;
1147
            self.__height += 1;
1148
        
1149
        [new_node1, new_node2] = self.__split_leaf_node(split_node);
1150
        
1151
        self.__leafes.remove(split_node);
1152
        self.__leafes.append(new_node1);
1153
        self.__leafes.append(new_node2);
1154
        
1155
        # Update parent list of successors
1156
        parent = split_node.parent;
1157
        parent.successors.remove(split_node);
1158
        parent.successors.append(new_node1);
1159
        parent.successors.append(new_node2);
1160
        
1161
        # Update statistics
1162
        self.__amount_nodes += 1;
1163
    
1164
    
1165
    def __split_nonleaf_node(self, node):
1166
        """!
1167
        @brief Performs splitting of the specified non-leaf node.
1168
        
1169
        @param[in] node (non_leaf_node): Non-leaf node that should be splitted.
1170
        
1171
        @return (list) New pair of non-leaf nodes [non_leaf_node1, non_leaf_node2].
1172
        
1173
        """
1174
        
1175
        [farthest_node1, farthest_node2] = node.get_farthest_successors(self.__type_measurement);
1176
        
1177
        # create new non-leaf nodes
1178
        new_node1 = non_leaf_node(farthest_node1.feature, node.parent, [ farthest_node1 ], None);
1179
        new_node2 = non_leaf_node(farthest_node2.feature, node.parent, [ farthest_node2 ], None);
1180
        
1181
        farthest_node1.parent = new_node1;
1182
        farthest_node2.parent = new_node2;
1183
        
1184
        # re-insert other successors
1185
        for successor in node.successors:
1186
            if ( (successor is not farthest_node1) and (successor is not farthest_node2) ):
1187
                distance1 = new_node1.get_distance(successor, self.__type_measurement);
1188
                distance2 = new_node2.get_distance(successor, self.__type_measurement);
1189
                
1190
                if (distance1 < distance2):
1191
                    new_node1.insert_successor(successor);
1192
                else:
1193
                    new_node2.insert_successor(successor);
1194
        
1195
        return [new_node1, new_node2];
1196
    
1197
    
1198
    def __split_leaf_node(self, node):
1199
        """!
1200
        @brief Performs splitting of the specified leaf node.
1201
        
1202
        @param[in] node (leaf_node): Leaf node that should be splitted.
1203
        
1204
        @return (list) New pair of leaf nodes [leaf_node1, leaf_node2].
1205
        
1206
        @warning Splitted node is transformed to non_leaf.
1207
        
1208
        """
1209
        
1210
        # search farthest pair of entries
1211
        [farthest_entity1, farthest_entity2] = node.get_farthest_entries(self.__type_measurement);
1212
                    
1213
        # create new nodes
1214
        new_node1 = leaf_node(farthest_entity1, node.parent, [ farthest_entity1 ], None);
1215
        new_node2 = leaf_node(farthest_entity2, node.parent, [ farthest_entity2 ], None);
1216
        
1217
        # re-insert other entries
1218
        for entity in node.entries:
1219
            if ( (entity is not farthest_entity1) and (entity is not farthest_entity2) ):
1220
                distance1 = new_node1.feature.get_distance(entity, self.__type_measurement);
1221
                distance2 = new_node2.feature.get_distance(entity, self.__type_measurement);
1222
                
1223
                if (distance1 < distance2):
1224
                    new_node1.insert_entry(entity);
1225
                else:
1226
                    new_node2.insert_entry(entity);
1227
        
1228
        return [new_node1, new_node2];
1229
    
1230
    
1231
    def show_feature_destibution(self, data = None):
1232
        visualizer = cluster_visualizer();
1233
        
1234
        print("amount of nodes: ", self.__amount_nodes);
1235
        
1236
        if (data is not None):
1237
            visualizer.append_cluster(data, marker = 'x');
1238
        
1239
        for level in range(0, self.height):
1240
            level_nodes = self.get_level_nodes(level);
1241
            
1242
            centers = [ node.feature.get_centroid() for node in level_nodes ];
1243
            visualizer.append_cluster(centers, None, markersize = (self.height - level + 1) * 5);
1244
        
1245
        visualizer.show();