Completed
Push — master ( 93c3d2...b7ad63 )
by Andrei
04:01
created

cftree.show_feature_destibution()   B

Complexity

Conditions 4

Size

Total Lines 22

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
dl 0
loc 22
rs 8.9197
c 0
b 0
f 0
1
"""!
2
3
@brief Data Structure: CF-Tree
4
@details Based on book description:
5
         - M.Zhang, R.Ramakrishnan, M.Livny. BIRCH: An Efficient Data Clustering Method for Very Large Databases. 1996.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2017
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
from copy import copy;
29
30
from pyclustering.cluster import cluster_visualizer;
31
32
from pyclustering.utils import euclidean_distance_sqrt;
33
from pyclustering.utils import manhattan_distance;
34
from pyclustering.utils import list_math_addition, list_math_subtraction, list_math_multiplication;
35
from pyclustering.utils import linear_sum, square_sum;
36
37
from enum import IntEnum;
38
39
40
class measurement_type(IntEnum):
41
    """!
42
    @brief Enumeration of measurement types for CF-Tree.
43
    
44
    @see cftree
45
    
46
    """
47
    
48
    ## Euclidian distance between centroids of clustering features.
49
    CENTROID_EUCLIDIAN_DISTANCE     = 0;
50
    
51
    ## Manhattan distance between centroids of clustering features.
52
    CENTROID_MANHATTAN_DISTANCE     = 1;
53
    
54
    ## Average distance between all objects from clustering features.
55
    AVERAGE_INTER_CLUSTER_DISTANCE  = 2;
56
    
57
    ## Average distance between all objects within clustering features and between them.
58
    AVERAGE_INTRA_CLUSTER_DISTANCE  = 3;
59
    
60
    ## Variance based distance between clustering features.
61
    VARIANCE_INCREASE_DISTANCE      = 4;
62
63
64
class cfnode_type(IntEnum):
65
    """!
66
    @brief Enumeration of CF-Node types that are used by CF-Tree.
67
    
68
    @see cfnode
69
    @see cftree
70
    
71
    """
72
    
73
    ## Undefined node.
74
    CFNODE_DUMMY    = 0;
75
    
76
    ## Leaf node hasn't got successors, only entries.
77
    CFNODE_LEAF     = 1;
78
    
79
    ## Non-leaf node has got successors and hasn't got entries.
80
    CFNODE_NONLEAF  = 2;
81
82
83
class cfentry:
84
    """!
85
    @brief Clustering feature representation.
86
    
87
    @see cfnode
88
    @see cftree
89
    
90
    """
91
    
92
    @property
93
    def number_points(self):
94
        """!
95
        @brief Returns number of points that are encoded.
96
        
97
        @return (uint) Number of encoded points.
98
        
99
        """
100
        return self.__number_points;
101
    
102
    @property
103
    def linear_sum(self):
104
        """!
105
        @brief Returns linear sum.
106
        
107
        @return (list) Linear sum.
108
        
109
        """
110
        
111
        return self.__linear_sum;
112
    
113
    @property
114
    def square_sum(self):
115
        """!
116
        @brief Returns square sum.
117
        
118
        @return (double) Square sum.
119
        
120
        """
121
        return self.__square_sum;
122
    
123
    
124
    def __init__(self, number_points, linear_sum, square_sum):
0 ignored issues
show
Comprehensibility Bug introduced by
linear_sum is re-defining a name which is already available in the outer-scope (previously defined on line 35).

It is generally a bad practice to shadow variables from the outer-scope. In most cases, this is done unintentionally and might lead to unexpected behavior:

param = 5

class Foo:
    def __init__(self, param):   # "param" would be flagged here
        self.param = param
Loading history...
Comprehensibility Bug introduced by
square_sum is re-defining a name which is already available in the outer-scope (previously defined on line 35).

It is generally a bad practice to shadow variables from the outer-scope. In most cases, this is done unintentionally and might lead to unexpected behavior:

param = 5

class Foo:
    def __init__(self, param):   # "param" would be flagged here
        self.param = param
Loading history...
125
        """!
126
        @brief CF-entry constructor.
127
        
128
        @param[in] number_points (uint): Number of objects that is represented by the entry.
129
        @param[in] linear_sum (list): Linear sum of values that represent objects in each dimension.
130
        @param[in] square_sum (double): Square sum of values that represent objects.
131
        
132
        """
133
        
134
        self.__number_points = number_points;
135
        self.__linear_sum = linear_sum;
136
        self.__square_sum = square_sum;
137
        
138
        self.__centroid = None;
139
        self.__radius = None;
140
        self.__diameter = None;
141
    
142
    
143
    def __copy__(self):
144
        """!
145
        @returns (cfentry) Makes copy of the cfentry instance.
146
        
147
        """
148
        return cfentry(self.__number_points, self.__linear_sum, self.__square_sum);
149
    
150
    
151
    def __repr__(self):
152
        """!
153
        @return (string) Default cfentry representation.
154
        
155
        """
156
        return 'CF (%s, %s, %0.2f) [%s]' % ( self.number_points, self.linear_sum, self.__square_sum, hex(id(self)) );    
157
    
158
    
159
    def __str__(self):
160
        """!
161
        @brief Default cfentry string representation.
162
        
163
        """
164
        return self.__repr__();
165
    
166
    
167
    def __add__(self, entry):
168
        """!
169
        @brief Overloaded operator add. Performs addition of two clustering features.
170
        
171
        @param[in] entry (cfentry): Entry that is added to the current.
172
        
173
        @return (cfentry) Result of addition of two clustering features.
174
        
175
        """
176
        
177
        number_points = self.number_points + entry.number_points;
178
        result_linear_sum = list_math_addition(self.linear_sum, entry.linear_sum);
179
        result_square_sum = self.square_sum + entry.square_sum;  
180
        
181
        return cfentry(number_points, result_linear_sum, result_square_sum);
182
    
183
    
184
    def __sub__(self, entry):
185
        """!
186
        @brief Overloaded operator sub. Performs substraction of two clustering features.
187
        @details Substraction can't be performed with clustering feature whose description is less then substractor.
188
        
189
        @param[in] entry (cfentry): Entry that is substracted from the current.
190
        
191
        @return (cfentry) Result of substraction of two clustering features.
192
        
193
        """
194
                
195
        number_points = self.number_points - entry.number_points;
196
        result_linear_sum = list_math_subtraction(self.linear_sum, entry.linear_sum);
197
        result_square_sum = self.square_sum - entry.square_sum;
198
        
199
        if ( (number_points < 0) or (result_square_sum < 0) ):
200
            raise NameError("Substruction with negative result is not possible for clustering features.");
201
        
202
        return cfentry(number_points, result_linear_sum, result_square_sum);
203
    
204
    
205
    def __eq__(self, entry):
206
        """!
207
        @brief Overloaded operator eq. 
208
        @details Performs comparison of two clustering features.
209
        
210
        @param[in] entry (cfentry): Entry that is used for comparison with current.
211
        
212
        @return (bool) True is both clustering features are equals in line with tolerance, otherwise False.
213
        
214
        """
215
                
216
        tolerance = 0.00001;
217
        
218
        result = (self.__number_points == entry.number_points);
219
        result &= ( (self.square_sum + tolerance > entry.square_sum) and (self.square_sum - tolerance < entry.square_sum) );
220
        
221
        for index_dimension in range(0, len(self.linear_sum)):
222
            result &= ( (self.linear_sum[index_dimension] + tolerance > entry.linear_sum[index_dimension]) and (self.linear_sum[index_dimension] - tolerance < entry.linear_sum[index_dimension]) );
223
        
224
        return result;
225
    
226
    
227
    def get_distance(self, entry, type_measurement):
228
        """!
229
        @brief Calculates distance between two clusters in line with measurement type.
230
        
231
        @details In case of usage CENTROID_EUCLIDIAN_DISTANCE square euclidian distance will be returned.
232
                 Square root should be taken from the result for obtaining real euclidian distance between
233
                 entries. 
234
        
235
        @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
236
        @param[in] type_measurement (measurement_type): Distance measurement algorithm between two clusters.
237
        
238
        @return (double) Distance between two clusters.
239
        
240
        """
241
        
242
        if (type_measurement is measurement_type.CENTROID_EUCLIDIAN_DISTANCE):
243
            return euclidean_distance_sqrt(entry.get_centroid(), self.get_centroid());
244
        
245
        elif (type_measurement is measurement_type.CENTROID_MANHATTAN_DISTANCE):
246
            return manhattan_distance(entry.get_centroid(), self.get_centroid());
247
        
248
        elif (type_measurement is measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE):
249
            return self.__get_average_inter_cluster_distance(entry);
250
            
251
        elif (type_measurement is measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE):
252
            return self.__get_average_intra_cluster_distance(entry);
253
        
254
        elif (type_measurement is measurement_type.VARIANCE_INCREASE_DISTANCE):
255
            return self.__get_variance_increase_distance(entry);
256
        
257
        else:
258
            assert 0;
259
    
260
        
261
    def get_centroid(self):
262
        """!
263
        @brief Calculates centroid of cluster that is represented by the entry. 
264
        @details It's calculated once when it's requested after the last changes.
265
        
266
        @return (list) Centroid of cluster that is represented by the entry.
267
        
268
        """
269
        
270
        if (self.__centroid is not None):
271
            return self.__centroid;
272
        
273
        self.__centroid = [0] * len(self.linear_sum);
274
        for index_dimension in range(0, len(self.linear_sum)):
275
            self.__centroid[index_dimension] = self.linear_sum[index_dimension] / self.number_points;
276
        
277
        return self.__centroid;
278
    
279
    
280
    def get_radius(self):
281
        """!
282
        @brief Calculates radius of cluster that is represented by the entry.
283
        @details It's calculated once when it's requested after the last changes.
284
        
285
        @return (double) Radius of cluster that is represented by the entry.
286
        
287
        """
288
        
289
        if (self.__radius is not None):
290
            return self.__radius;
291
        
292
        centroid = self.get_centroid();
293
        
294
        radius_part_1 = self.square_sum;
295
        
296
        radius_part_2 = 0.0;
297
        radius_part_3 = 0.0;
298
        
299
        if (type(centroid) == list):
300
            radius_part_2 = 2.0 * sum(list_math_multiplication(self.linear_sum, centroid));
301
            radius_part_3 = self.number_points * sum(list_math_multiplication(centroid, centroid));
302
        else:
303
            radius_part_2 = 2.0 * self.linear_sum * centroid;
304
            radius_part_3 = self.number_points * centroid * centroid;
305
        
306
        self.__radius = ( (1.0 / self.number_points) * (radius_part_1 - radius_part_2 + radius_part_3) ) ** 0.5;
307
        return self.__radius;
308
        
309
    
310
    def get_diameter(self):
311
        """!
312
        @brief Calculates diameter of cluster that is represented by the entry.
313
        @details It's calculated once when it's requested after the last changes.
314
        
315
        @return (double) Diameter of cluster that is represented by the entry.
316
        
317
        """
318
        
319
        if (self.__diameter is not None):
320
            return self.__diameter;
321
        
322
        diameter_part = 0.0;
323
        if (type(self.linear_sum) == list):
324
            diameter_part = self.square_sum * self.number_points - 2.0 * sum(list_math_multiplication(self.linear_sum, self.linear_sum)) + self.square_sum * self.number_points;
325
        else:
326
            diameter_part = self.square_sum * self.number_points - 2.0 * self.linear_sum * self.linear_sum + self.square_sum * self.number_points;
327
            
328
        self.__diameter = ( diameter_part / (self.number_points * (self.number_points - 1)) ) ** 0.5;
329
        return self.__diameter;
330
    
331
        
332
    def __get_average_inter_cluster_distance(self, entry):
333
        """!
334
        @brief Calculates average inter cluster distance between current and specified clusters.
335
        
336
        @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
337
        
338
        @return (double) Average inter cluster distance.
339
        
340
        """
341
        
342
        linear_part_distance = sum(list_math_multiplication(self.linear_sum, entry.linear_sum));
343
        
344
        return ( (entry.number_points * self.square_sum - 2.0 * linear_part_distance + self.number_points * entry.square_sum) / (self.number_points * entry.number_points) ) ** 0.5;
345
    
346
    
347
    def __get_average_intra_cluster_distance(self, entry):
348
        """!
349
        @brief Calculates average intra cluster distance between current and specified clusters.
350
        
351
        @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
352
        
353
        @return (double) Average intra cluster distance.
354
        
355
        """
356
        
357
        linear_part_first = list_math_addition(self.linear_sum, entry.linear_sum);
358
        linear_part_second = linear_part_first;
359
        
360
        linear_part_distance = sum(list_math_multiplication(linear_part_first, linear_part_second));
361
        
362
        general_part_distance = 2.0 * (self.number_points + entry.number_points) * (self.square_sum + entry.square_sum) - 2.0 * linear_part_distance;
363
        
364
        return (general_part_distance / ( (self.number_points + entry.number_points) * (self.number_points + entry.number_points - 1.0) )) ** 0.5;
365
    
366
    
367
    def __get_variance_increase_distance(self, entry):
368
        """!
369
        @brief Calculates variance increase distance between current and specified clusters.
370
        
371
        @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
372
        
373
        @return (double) Variance increase distance.
374
        
375
        """
376
                
377
        linear_part_12 = list_math_addition(self.linear_sum, entry.linear_sum);
378
        variance_part_first = (self.square_sum + entry.square_sum) - \
379
            2.0 * sum(list_math_multiplication(linear_part_12, linear_part_12)) / (self.number_points + entry.number_points) + \
380
            (self.number_points + entry.number_points) * sum(list_math_multiplication(linear_part_12, linear_part_12)) / (self.number_points + entry.number_points)**2.0;
381
382
        
383
        linear_part_11 = sum(list_math_multiplication(self.linear_sum, self.linear_sum));
384
        variance_part_second = -( self.square_sum - (2.0 * linear_part_11 / self.number_points) + (linear_part_11 / self.number_points) );
385
        
386
        linear_part_22 = sum(list_math_multiplication(entry.linear_sum, entry.linear_sum));
387
        variance_part_third = -( entry.square_sum - (2.0 / entry.number_points) * linear_part_22 + entry.number_points * (1.0 / entry.number_points ** 2.0) * linear_part_22 );
388
389
        return (variance_part_first + variance_part_second + variance_part_third);
390
        
391
392
class cfnode:
393
    """!
394
    @brief Representation of node of CF-Tree.
395
    
396
    """
397
    
398
    def __init__(self, feature, parent, payload):
399
        """!
400
        @brief Constructor of abstract CF node.
401
        
402
        @param[in] feature (cfentry): Clustering feature of the created node.
403
        @param[in] parent (cfnode): Parent of the created node.
404
        @param[in] payload (*): Data that is stored by the node.
405
        
406
        """
407
        
408
        ## Clustering feature of the node.
409
        self.feature = copy(feature);
410
        
411
        ## Pointer to the parent node (None for root).
412
        self.parent = parent;
413
        
414
        ## Type node (leaf or non-leaf).
415
        self.type = cfnode_type.CFNODE_DUMMY;
416
        
417
        ## Payload of node where user data can be stored.
418
        self.payload = payload;
419
    
420
    
421
    def __repr__(self):
422
        """!
423
        @return (string) Default representation of CF node.
424
        
425
        """
426
        
427
        return 'CF node %s, parent %s, feature %s' % ( hex(id(self)), self.parent, self.feature );
428
429
430
    def __str__(self):
431
        """!
432
        @return (string) String representation of CF node.
433
        
434
        """
435
        return self.__repr__();
436
    
437
    
438
    def get_distance(self, node, type_measurement):
439
        """!
440
        @brief Calculates distance between nodes in line with specified type measurement.
441
        
442
        @param[in] node (cfnode): CF-node that is used for calculation distance to the current node.
443
        @param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance.
444
        
445
        @return (double) Distance between two nodes.
446
        
447
        """
448
        
449
        return self.feature.get_distance(node.feature, type_measurement);
450
    
451
452
class non_leaf_node(cfnode):
453
    """!
454
    @brief Representation of clustering feature non-leaf node.
455
    
456
    """ 
457
    
458
    @property
459
    def successors(self):
460
        """!
461
        @return (list) List of successors of the node.
462
        
463
        """
464
        return self.__successors;
465
    
466
    
467
    def __init__(self, feature, parent, successors, payload):
468
        """!
469
        @brief Create CF Non-leaf node.
470
        
471
        @param[in] feature (cfentry): Clustering feature of the created node.
472
        @param[in] parent (non_leaf_node): Parent of the created node.
473
        @param[in] successors (list): List of successors of the node.
474
        @param[in] payload (*): Data that is stored by the node.
475
        
476
        """
477
                
478
        super().__init__(feature, parent, payload);
479
        
480
        ## Node type in CF tree that is CFNODE_NONLEAF for non leaf node.
481
        self.type = cfnode_type.CFNODE_NONLEAF;
482
        
483
        self.__successors = successors;
484
    
485
    
486
    def __repr__(self):
487
        """!
488
        @return (string) Representation of non-leaf node representation.
489
        
490
        """   
491
        return 'Non-leaf node %s, parent %s, feature %s, successors: %d' % ( hex(id(self)), self.parent, self.feature, len(self.successors) );
492
    
493
    
494
    def __str__(self):
495
        """!
496
        @return (string) String non-leaf representation.
497
        
498
        """
499
        return self.__repr__();
500
    
501
    
502
    def insert_successor(self, successor):
503
        """!
504
        @brief Insert successor to the node.
505
        
506
        @param[in] successor (cfnode): Successor for adding.
507
        
508
        """
509
        
510
        self.feature += successor.feature;
511
        self.successors.append(successor);
512
        
513
        successor.parent = self;
514
    
515
    
516
    def remove_successor(self, successor):
517
        """!
518
        @brief Remove successor from the node.
519
        
520
        @param[in] successor (cfnode): Successor for removing.
521
        
522
        """
523
        
524
        self.feature -= successor.feature;
525
        self.successors.append(successor);
526
        
527
        successor.parent = self;
528
    
529
    
530
    def merge(self, node):
531
        """!
532
        @brief Merge non-leaf node to the current.
533
        
534
        @param[in] node (non_leaf_node): Non-leaf node that should be merged with current.
535
        
536
        """
537
                
538
        self.feature += node.feature;
539
        
540
        for child in node.successors:
541
            child.parent = self;
542
            self.successors.append(child);
543
    
544
    
545
    def get_farthest_successors(self, type_measurement):
546
        """!
547
        @brief Find pair of farthest successors of the node in line with measurement type.
548
        
549
        @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest successors.
550
        
551
        @return (list) Pair of farthest successors represented by list [cfnode1, cfnode2].
552
        
553
        """
554
        
555
        farthest_node1 = None;
556
        farthest_node2 = None;
557
        farthest_distance = 0;
558
        
559
        for i in range(0, len(self.successors)):
560
            candidate1 = self.successors[i];
561
            
562
            for j in range(i + 1, len(self.successors)):
563
                candidate2 = self.successors[j];
564
                candidate_distance = candidate1.get_distance(candidate2, type_measurement);
565
                
566
                if (candidate_distance > farthest_distance):
567
                    farthest_distance = candidate_distance;
568
                    farthest_node1 = candidate1;
569
                    farthest_node2 = candidate2;        
570 View Code Duplication
        
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
571
                    return [farthest_node1, farthest_node2];
572
    
573
    
574
    def get_nearest_successors(self, type_measurement):
575
        """!
576
        @brief Find pair of nearest successors of the node in line with measurement type.
577
        
578
        @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest successors.
579
        
580
        @return (list) Pair of nearest successors represented by list.
581
        
582
        """
583
                
584
        nearest_node1 = None;
585
        nearest_node2 = None;
586
        nearest_distance = float("Inf");
587
        
588
        for i in range(0, len(self.successors)):
589
            candidate1 = self.successors[i];
590
            
591
            for j in range(i + 1, len(self.successors)):
592
                candidate2 = self.successors[j];
593
                candidate_distance = candidate1.get_distance(candidate2, type_measurement);
594
                
595
                if (candidate_distance < nearest_distance):
596
                    nearest_distance = candidate_distance;
597
                    nearest_node1 = candidate1;
598
                    nearest_node2 = candidate2;
599
        
600
        return [nearest_node1, nearest_node2];
601
602
603
class leaf_node(cfnode):
604
    """!
605
    @brief Represents clustering feature leaf node.
606
    
607
    """
608
    
609
    @property
610
    def entries(self):
611
        """!
612
        @return (list) List of leaf nodes.
613
        
614
        """
615
        return self.__entries;
616
    
617
    
618
    def __init__(self, feature, parent, entries, payload):
619
        """!
620
        @brief Create CF Leaf node.
621
        
622
        @param[in] feature (cfentry): Clustering feature of the created node.
623
        @param[in] parent (non_leaf_node): Parent of the created node.
624
        @param[in] entries (list): List of entries of the node.
625
        @param[in] payload (*): Data that is stored by the node.
626
        
627
        """
628
        
629
        super().__init__(feature, parent, payload);
630
        
631
        ## Node type in CF tree that is CFNODE_LEAF for leaf node.
632
        self.type = cfnode_type.CFNODE_LEAF;
633
        
634
        self.__entries = entries;   # list of clustering features
635
        
636
    
637
    def __repr__(self):
638
        """!
639
        @return (string) Default leaf node represenation.
640
        
641
        """
642
        text_entries = "\n";
643
        for entry in self.entries:
644
            text_entries += "\t" + str(entry) + "\n";
645
        
646
        return 'Leaf-node %s, parent %s, feature %s, entries: %d %s' % ( hex(id(self)), self.parent, self.feature, len(self.entries), text_entries );
647
    
648
    
649
    def __str__(self):
650
        """!
651
        @return (string) String leaf node representation.
652
        
653
        """
654
        return self.__repr__();
655
    
656
    
657
    def insert_entry(self, entry):
658
        """!
659
        @brief Insert new clustering feature to the leaf node.
660
        
661
        @param[in] entry (cfentry): Clustering feature.
662
        
663
        """
664
                              
665
        self.feature += entry;
666
        self.entries.append(entry);
667
        
668
    
669
    def remove_entry(self, entry):
670
        """!
671
        @brief Remove clustering feature from the leaf node.
672
        
673
        @param[in] entry (cfentry): Clustering feature.
674
        
675
        """
676
                
677
        self.feature -= entry;
678
        self.entries.remove(entry);
679
    
680
    
681
    def merge(self, node):
682
        """!
683
        @brief Merge leaf node to the current.
684
        
685
        @param[in] node (leaf_node): Leaf node that should be merged with current.
686
        
687
        """
688
        
689
        self.feature += node.feature;
690
        
691
        # Move entries from merged node
692
        for entry in node.entries:
693
            self.entries.append(entry);
694
            
695
    
696
    def get_farthest_entries(self, type_measurement):
697
        """!
698
        @brief Find pair of farthest entries of the node.
699
        
700
        @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest entries.
701
        
702
        @return (list) Pair of farthest entries of the node that are represented by list.
703
        
704
        """
705
        
706
        farthest_entity1 = None;
707
        farthest_entity2 = None;
708
        farthest_distance = 0;
709
        
710
        for i in range(0, len(self.entries)):
711
            candidate1 = self.entries[i];
712
            
713
            for j in range(i + 1, len(self.entries)):
714
                candidate2 = self.entries[j];
715
                candidate_distance = candidate1.get_distance(candidate2, type_measurement);
716
                
717
                if (candidate_distance > farthest_distance):
718
                    farthest_distance = candidate_distance;
719
                    farthest_entity1 = candidate1;
720
                    farthest_entity2 = candidate2;
721
        
722
        return [farthest_entity1, farthest_entity2];
723
    
724
    
725
    def get_nearest_index_entry(self, entry, type_measurement):
726
        """!
727
        @brief Find nearest index of nearest entry of node for the specified entry.
728
        
729
        @param[in] entry (cfentry): Entry that is used for calculation distance.
730
        @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
731
        
732
        @return (uint) Index of nearest entry of node for the specified entry.
733
        
734
        """
735
        
736
        minimum_distance = float('Inf');
737
        nearest_index = 0;
738
        
739
        for candidate_index in range(0, len(self.entries)):
740
            candidate_distance = self.entries[candidate_index].get_distance(entry, type_measurement);
741
            if (candidate_distance < minimum_distance):
742
                nearest_index = candidate_index;
743
        
744
        return nearest_index;
745
    
746
    
747
    def get_nearest_entry(self, entry, type_measurement):
748
        """!
749
        @brief Find nearest entry of node for the specified entry.
750
        
751
        @param[in] entry (cfentry): Entry that is used for calculation distance.
752
        @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
753
        
754
        @return (cfentry) Nearest entry of node for the specified entry.
755
        
756
        """
757
        
758
        min_key = lambda cur_entity: cur_entity.get_distance(entry, type_measurement);
759
        return min(self.__entries, key = min_key);
760
761
762
class cftree:
763
    """!
764
    @brief CF-Tree representation.
765
    @details A CF-tree is a height-balanced tree with two parameters: branching factor and threshold.
766
    
767
    """
768
    
769
    @property
770
    def root(self):
771
        """!
772
        @return (cfnode) Root of the tree.
773
        
774
        """
775
        return self.__root;
776
    
777
    
778
    @property
779
    def leafes(self):
780
        """!
781
        @return (list) List of all non-leaf nodes in the tree.
782
        
783
        """
784
        return self.__leafes;
785
    
786
    
787
    @property
788
    def amount_nodes(self):
789
        """!
790
        @return (unit) Number of nodes (leaf and non-leaf) in the tree.
791
        
792
        """
793
        return self.__amount_nodes;
794
    
795
    
796
    @property
797
    def amount_entries(self):
798
        """!
799
        @return (uint) Number of entries in the tree.
800
        
801
        """
802
        return self.__amount_entries;
803
    
804
    
805
    @property
806
    def height(self):
807
        """!
808
        @return (uint) Height of the tree.
809
        
810
        """
811
        return self.__height;
812
    
813
    
814
    @property
815
    def branch_factor(self):
816
        """!
817
        @return (uint) Branching factor of the tree.
818
        @details Branching factor defines maximum number of successors in each non-leaf node.
819
        
820
        """
821
        return self.__branch_factor;
822
    
823
    
824
    @property
825
    def threshold(self):
826
        """!
827
        @return (double) Threshold of the tree that represents maximum diameter of sub-clusters that is formed by leaf node entries.
828
        
829
        """
830
        return self.__threshold;
831
    
832
    
833
    @property
834
    def max_entries(self):
835
        """!
836
        @return (uint) Maximum number of entries in each leaf node.
837
        
838
        """
839
        return self.__max_entries;
840
    
841
    
842
    @property
843
    def type_measurement(self):
844
        """!
845
        @return (measurement_type) Type that is used for measuring.
846
        
847
        """
848
        return self.__type_measurement;
849
    
850
    
851
    def __init__(self, branch_factor, max_entries, threshold, type_measurement = measurement_type.CENTROID_EUCLIDIAN_DISTANCE):
852
        """!
853
        @brief Create CF-tree.
854
        
855
        @param[in] branch_factor (uint): Maximum number of children for non-leaf nodes.
856
        @param[in] max_entries (uint): Maximum number of entries for leaf nodes.
857
        @param[in] threshold (double): Maximum diameter of feature clustering for each leaf node.
858
        @param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance metrics.
859
        
860
        """
861
862
        self.__root = None;
863
864
        self.__branch_factor = branch_factor; # maximum number of children
865
        if (self.__branch_factor < 2):
866
            self.__branch_factor = 2;
867
        
868
        
869
        self.__threshold = threshold;         # maximum diameter of sub-clusters stored at the leaf nodes
870
        self.__max_entries = max_entries;
871
        
872
        self.__leafes = [];
873
        
874
        self.__type_measurement = type_measurement;
875
        
876
        # statistics
877
        self.__amount_nodes = 0;    # root, despite it can be None.
878
        self.__amount_entries = 0;
879
        self.__height = 0;          # tree size with root.
880
    
881
    
882
    def get_level_nodes(self, level):
883
        """!
884
        @brief Traverses CF-tree to obtain nodes at the specified level.
885
        
886
        @param[in] level (uint): CF-tree level from that nodes should be returned.
887
        
888
        @return (list) List of CF-nodes that are located on the specified level of the CF-tree.
889
        
890
        """
891
        
892
        level_nodes = [];
893
        if (level < self.__height):
894
            level_nodes = self.__recursive_get_level_nodes(level, self.__root);
895
        
896
        return level_nodes;
897
    
898
    
899
    def __recursive_get_level_nodes(self, level, node):
900
        """!
901
        @brief Traverses CF-tree to obtain nodes at the specified level recursively.
902
        
903
        @param[in] level (uint): Current CF-tree level.
904
        @param[in] node (cfnode): CF-node from that traversing is performed.
905
        
906
        @return (list) List of CF-nodes that are located on the specified level of the CF-tree.
907
        
908
        """
909
        
910
        level_nodes = [];
911
        if (level is 0):
912
            level_nodes.append(node);
913
        
914
        else:
915
            for sucessor in node.successors:
916
                level_nodes += self.__recursive_get_level_nodes(level - 1, sucessor);
917
        
918
        return level_nodes;
919
    
920
    
921
    def insert_cluster(self, cluster):
922
        """!
923
        @brief Insert cluster that is represented as list of points where each point is represented by list of coordinates.
924
        @details Clustering feature is created for that cluster and inserted to the tree.
925
        
926
        @param[in] cluster (list): Cluster that is represented by list of points that should be inserted to the tree.
927
        
928
        """
929
        
930
        entry = cfentry(len(cluster), linear_sum(cluster), square_sum(cluster));
931
        self.insert(entry);
932
    
933
    
934
    def insert(self, entry):
935
        """!
936
        @brief Insert clustering feature to the tree.
937
        
938
        @param[in] entry (cfentry): Clustering feature that should be inserted.
939
        
940
        """
941
                
942
        if (self.__root is None):
943
            node = leaf_node(entry, None, [ entry ], None);
944
            
945
            self.__root = node;
946
            self.__leafes.append(node);
947
            
948
            # Update statistics
949
            self.__amount_entries += 1;
950
            self.__amount_nodes += 1;
951
            self.__height += 1;             # root has successor now
952
        else:
953
            child_node_updation = self.__recursive_insert(entry, self.__root);
954
            if (child_node_updation is True):
955
                # Splitting has been finished, check for possibility to merge (at least we have already two children).
956
                if (self.__merge_nearest_successors(self.__root) is True):
957
                    self.__amount_nodes -= 1;
958
    
959
    
960
    def find_nearest_leaf(self, entry, search_node = None):
961
        """!
962
        @brief Search nearest leaf to the specified clustering feature.
963
        
964
        @param[in] entry (cfentry): Clustering feature.
965
        @param[in] search_node (cfnode): Node from that searching should be started, if None then search process will be started for the root.
966
        
967
        @return (leaf_node) Nearest node to the specified clustering feature.
968
        
969
        """
970
        
971
        if (search_node is None):
972
            search_node = self.__root;
973
        
974
        nearest_node = search_node;
975
        
976
        if (search_node.type == cfnode_type.CFNODE_NONLEAF):
977
            min_key = lambda child_node: child_node.feature.get_distance(entry, self.__type_measurement);
978
            nearest_child_node = min(search_node.successors, key = min_key);
979
            
980
            nearest_node = self.find_nearest_leaf(entry, nearest_child_node);
981
        
982
        return nearest_node;
983
    
984
    
985
    def __recursive_insert(self, entry, search_node):
986
        """!
987
        @brief Recursive insert of the entry to the tree.
988
        @details It performs all required procedures during insertion such as splitting, merging.
989
        
990
        @param[in] entry (cfentry): Clustering feature.
991
        @param[in] search_node (cfnode): Node from that insertion should be started.
992
        
993
        @return (bool) True if number of nodes at the below level is changed, otherwise False.
994
        
995
        """
996
        
997
        # None-leaf node
998
        if (search_node.type == cfnode_type.CFNODE_NONLEAF):
999
            return self.__insert_for_noneleaf_node(entry, search_node);
1000
        
1001
        # Leaf is reached 
1002
        else:
1003
            return self.__insert_for_leaf_node(entry, search_node);
1004
    
1005
    
1006
    def __insert_for_leaf_node(self, entry, search_node):
1007
        """!
1008
        @brief Recursive insert entry from leaf node to the tree.
1009
        
1010
        @param[in] entry (cfentry): Clustering feature.
1011
        @param[in] search_node (cfnode): None-leaf node from that insertion should be started.
1012
        
1013
        @return (bool) True if number of nodes at the below level is changed, otherwise False.
1014
        
1015
        """
1016
        
1017
        node_amount_updation = False;
1018
        
1019
        # Try to absorb by the entity
1020
        index_nearest_entry = search_node.get_nearest_index_entry(entry, self.__type_measurement);
1021
        merged_entry = search_node.entries[index_nearest_entry] + entry;
1022
        
1023
        # Otherwise try to add new entry
1024
        if (merged_entry.get_diameter() > self.__threshold):
1025
            # If it's not exceeded append entity and update feature of the leaf node.
1026
            search_node.insert_entry(entry);
1027
            
1028
            # Otherwise current node should be splitted
1029
            if (len(search_node.entries) > self.__max_entries):
1030
                self.__split_procedure(search_node);
1031
                node_amount_updation = True;
1032
            
1033
            # Update statistics
1034
            self.__amount_entries += 1;
1035
            
1036
        else:
1037
            search_node.entries[index_nearest_entry] = merged_entry;
1038
            search_node.feature += entry;
1039
        
1040
        return node_amount_updation;
1041
    
1042
    
1043
    def __insert_for_noneleaf_node(self, entry, search_node):
1044
        """!
1045
        @brief Recursive insert entry from none-leaf node to the tree.
1046
        
1047
        @param[in] entry (cfentry): Clustering feature.
1048
        @param[in] search_node (cfnode): None-leaf node from that insertion should be started.
1049
        
1050
        @return (bool) True if number of nodes at the below level is changed, otherwise False.
1051
        
1052
        """
1053
        
1054
        node_amount_updation = False;
1055
        
1056
        min_key = lambda child_node: child_node.get_distance(search_node, self.__type_measurement);
1057
        nearest_child_node = min(search_node.successors, key = min_key);
1058
        
1059
        child_node_updation = self.__recursive_insert(entry, nearest_child_node);
1060
        
1061
        # Update clustering feature of none-leaf node.
1062
        search_node.feature += entry;
1063
            
1064
        # Check branch factor, probably some leaf has been splitted and threshold has been exceeded.
1065
        if (len(search_node.successors) > self.__branch_factor):
1066
            
1067
            # Check if it's aleady root then new root should be created (height is increased in this case).
1068
            if (search_node is self.__root):
1069
                self.__root = non_leaf_node(search_node.feature, None, [ search_node ], None);
1070
                search_node.parent = self.__root;
1071
                
1072
                # Update statistics
1073
                self.__amount_nodes += 1;
1074
                self.__height += 1;
1075
                
1076
            [new_node1, new_node2] = self.__split_nonleaf_node(search_node);
1077
            
1078
            # Update parent list of successors
1079
            parent = search_node.parent;
1080
            parent.successors.remove(search_node);
1081
            parent.successors.append(new_node1);
1082
            parent.successors.append(new_node2);
1083
            
1084
            # Update statistics
1085
            self.__amount_nodes += 1;
1086
            node_amount_updation = True;
1087
            
1088
        elif (child_node_updation is True):
1089
            # Splitting has been finished, check for possibility to merge (at least we have already two children).
1090
            if (self.__merge_nearest_successors(search_node) is True):
1091
                self.__amount_nodes -= 1;
1092
        
1093
        return node_amount_updation;
1094
    
1095
    
1096
    def __merge_nearest_successors(self, node):
1097
        """!
1098
        @brief Find nearest sucessors and merge them.
1099
        
1100
        @param[in] node (non_leaf_node): Node whose two nearest successors should be merged.
1101
        
1102
        @return (bool): True if merging has been successfully performed, otherwise False.
1103
        
1104
        """
1105
        
1106
        merging_result = False;
1107
        
1108
        if (node.successors[0].type == cfnode_type.CFNODE_NONLEAF):
1109
            [nearest_child_node1, nearest_child_node2] = node.get_nearest_successors(self.__type_measurement);
1110
            
1111
            if (len(nearest_child_node1.successors) + len(nearest_child_node2.successors) <= self.__branch_factor):
1112
                node.successors.remove(nearest_child_node2);
1113
                if (nearest_child_node2.type == cfnode_type.CFNODE_LEAF):
1114
                    self.__leafes.remove(nearest_child_node2);
1115
                
1116
                nearest_child_node1.merge(nearest_child_node2);
1117
                
1118
                merging_result = True;
1119
        
1120
        return merging_result;
1121
    
1122
    
1123
    def __split_procedure(self, split_node):
1124
        """!
1125
        @brief Starts node splitting procedure in the CF-tree from the specify node.
1126
        
1127
        @param[in] split_node (cfnode): CF-tree node that should be splitted.
1128
        
1129
        """
1130
        if (split_node is self.__root):
1131
            self.__root = non_leaf_node(split_node.feature, None, [ split_node ], None);
1132
            split_node.parent = self.__root;
1133
            
1134
            # Update statistics
1135
            self.__amount_nodes += 1;
1136
            self.__height += 1;
1137
        
1138
        [new_node1, new_node2] = self.__split_leaf_node(split_node);
1139
        
1140
        self.__leafes.remove(split_node);
1141
        self.__leafes.append(new_node1);
1142
        self.__leafes.append(new_node2);
1143
        
1144
        # Update parent list of successors
1145
        parent = split_node.parent;
1146
        parent.successors.remove(split_node);
1147
        parent.successors.append(new_node1);
1148
        parent.successors.append(new_node2);
1149
        
1150
        # Update statistics
1151
        self.__amount_nodes += 1;
1152
    
1153
    
1154
    def __split_nonleaf_node(self, node):
1155
        """!
1156
        @brief Performs splitting of the specified non-leaf node.
1157
        
1158
        @param[in] node (non_leaf_node): Non-leaf node that should be splitted.
1159
        
1160
        @return (list) New pair of non-leaf nodes [non_leaf_node1, non_leaf_node2].
1161
        
1162
        """
1163
        
1164
        [farthest_node1, farthest_node2] = node.get_farthest_successors(self.__type_measurement);
1165
        
1166
        # create new non-leaf nodes
1167
        new_node1 = non_leaf_node(farthest_node1.feature, node.parent, [ farthest_node1 ], None);
1168
        new_node2 = non_leaf_node(farthest_node2.feature, node.parent, [ farthest_node2 ], None);
1169
        
1170
        farthest_node1.parent = new_node1;
1171
        farthest_node2.parent = new_node2;
1172
        
1173
        # re-insert other successors
1174
        for successor in node.successors:
1175
            if ( (successor is not farthest_node1) and (successor is not farthest_node2) ):
1176
                distance1 = new_node1.get_distance(successor, self.__type_measurement);
1177
                distance2 = new_node2.get_distance(successor, self.__type_measurement);
1178
                
1179
                if (distance1 < distance2):
1180
                    new_node1.insert_successor(successor);
1181
                else:
1182
                    new_node2.insert_successor(successor);
1183
        
1184
        return [new_node1, new_node2];
1185
    
1186
    
1187
    def __split_leaf_node(self, node):
1188
        """!
1189
        @brief Performs splitting of the specified leaf node.
1190
        
1191
        @param[in] node (leaf_node): Leaf node that should be splitted.
1192
        
1193
        @return (list) New pair of leaf nodes [leaf_node1, leaf_node2].
1194
        
1195
        @warning Splitted node is transformed to non_leaf.
1196
        
1197
        """
1198
        
1199
        # search farthest pair of entries
1200
        [farthest_entity1, farthest_entity2] = node.get_farthest_entries(self.__type_measurement);
1201
                    
1202
        # create new nodes
1203
        new_node1 = leaf_node(farthest_entity1, node.parent, [ farthest_entity1 ], None);
1204
        new_node2 = leaf_node(farthest_entity2, node.parent, [ farthest_entity2 ], None);
1205
        
1206
        # re-insert other entries
1207
        for entity in node.entries:
1208
            if ( (entity is not farthest_entity1) and (entity is not farthest_entity2) ):
1209
                distance1 = new_node1.feature.get_distance(entity, self.__type_measurement);
1210
                distance2 = new_node2.feature.get_distance(entity, self.__type_measurement);
1211
                
1212
                if (distance1 < distance2):
1213
                    new_node1.insert_entry(entity);
1214
                else:
1215
                    new_node2.insert_entry(entity);
1216
        
1217
        return [new_node1, new_node2];
1218
    
1219
    
1220
    def show_feature_destibution(self, data = None):
1221
        """!
1222
        @brief Shows feature distribution.
1223
        @details Only features in 1D, 2D, 3D space can be visualized.
1224
        
1225
        @param[in] data (list): List of points that will be used for visualization, if it not specified than feature will be displayed only.
1226
        
1227
        """
1228
        visualizer = cluster_visualizer();
1229
        
1230
        print("amount of nodes: ", self.__amount_nodes);
1231
        
1232
        if (data is not None):
1233
            visualizer.append_cluster(data, marker = 'x');
1234
        
1235
        for level in range(0, self.height):
1236
            level_nodes = self.get_level_nodes(level);
1237
            
1238
            centers = [ node.feature.get_centroid() for node in level_nodes ];
1239
            visualizer.append_cluster(centers, None, markersize = (self.height - level + 1) * 5);
1240
        
1241
        visualizer.show();