Completed
Push — master ( ed6865...b2f733 )
by Andrei
01:40
created

cftree.__insert_for_leaf_node()   A

Complexity

Conditions 4

Size

Total Lines 56

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
dl 0
loc 56
rs 9.0544
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
"""!
2
3
@brief Data Structure: CF-Tree
4
@details Based on book description:
5
         - M.Zhang, R.Ramakrishnan, M.Livny. BIRCH: An Efficient Data Clustering Method for Very Large Databases. 1996.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2016
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
from copy import copy;
29
30
from pyclustering.utils import euclidean_distance_sqrt;
31
from pyclustering.utils import manhattan_distance;
32
from pyclustering.utils import list_math_addition, list_math_subtraction, list_math_multiplication;
33
from pyclustering.utils import linear_sum, square_sum;
34
35
from enum import IntEnum;
36
37
38
class measurement_type(IntEnum):
39
    """!
40
    @brief Enumeration of measurement types for CF-Tree.
41
    
42
    @see cftree
43
    
44
    """
45
    
46
    ## Euclidian distance between centroids of clustering features.
47
    CENTROID_EUCLIDIAN_DISTANCE     = 0;
48
    
49
    ## Manhattan distance between centroids of clustering features.
50
    
51
    CENTROID_MANHATTAN_DISTANCE     = 1;
52
    
53
    ## Average distance between all objects from clustering features.
54
    AVERAGE_INTER_CLUSTER_DISTANCE  = 2;
55
    
56
    ## Average distance between all objects within clustering features and between them.
57
    AVERAGE_INTRA_CLUSTER_DISTANCE  = 3;
58
    
59
    ## Variance based distance between clustering features.
60
    VARIANCE_INCREASE_DISTANCE      = 4;
61
    
62
63
class cfnode_type(IntEnum):
64
    """!
65
    @brief Enumeration of CF-Node types that are used by CF-Tree.
66
    
67
    @see cfnode
68
    @see cftree
69
    
70
    """
71
    
72
    ## Undefined node.
73
    CFNODE_DUMMY    = 0;
74
    
75
    ## Leaf node hasn't got successors, only entries.
76
    CFNODE_LEAF     = 1;
77
    
78
    ## Non-leaf node has got successors and hasn't got entries.
79
    CFNODE_NONLEAF  = 2;
80
    
81
82
class cfentry:
83
    """!
84
    @brief Clustering feature representation.
85
    
86
    @see cfnode
87
    @see cftree
88
    
89
    """
90
    
91
    @property
92
    def number_points(self):
93
        """!
94
        @brief Returns number of points that are encoded.
95
        
96
        @return (uint) Number of encoded points.
97
        
98
        """
99
        return self.__number_points;
100
    
101
    @property
102
    def linear_sum(self):
103
        """!
104
        @brief Returns linear sum.
105
        
106
        @return (list) Linear sum.
107
        
108
        """
109
        
110
        return self.__linear_sum;
111
    
112
    @property
113
    def square_sum(self):
114
        """!
115
        @brief Returns square sum.
116
        
117
        @return (double) Square sum.
118
        
119
        """
120
        return self.__square_sum;
121
    
122
    
123
    def __init__(self, number_points, linear_sum, square_sum):
0 ignored issues
show
Comprehensibility Bug introduced by
square_sum is re-defining a name which is already available in the outer-scope (previously defined on line 33).

It is generally a bad practice to shadow variables from the outer-scope. In most cases, this is done unintentionally and might lead to unexpected behavior:

param = 5

class Foo:
    def __init__(self, param):   # "param" would be flagged here
        self.param = param
Loading history...
Comprehensibility Bug introduced by
linear_sum is re-defining a name which is already available in the outer-scope (previously defined on line 33).

It is generally a bad practice to shadow variables from the outer-scope. In most cases, this is done unintentionally and might lead to unexpected behavior:

param = 5

class Foo:
    def __init__(self, param):   # "param" would be flagged here
        self.param = param
Loading history...
124
        """!
125
        @brief CF-entry constructor.
126
        
127
        @param[in] number_points (uint): Number of objects that is represented by the entry.
128
        @param[in] linear_sum (list): Linear sum of values that represent objects in each dimension.
129
        @param[in] square_sum (double): Square sum of values that represent objects.
130
        
131
        """
132
        
133
        self.__number_points = number_points;
134
        self.__linear_sum = linear_sum;
135
        self.__square_sum = square_sum;
136
        
137
        self.__centroid = None;
138
        self.__radius = None;
139
        self.__diameter = None;
140
    
141
    
142
    def __copy__(self):
143
        """!
144
        @returns (cfentry) Makes copy of the cfentry instance.
145
        
146
        """
147
        return cfentry(self.__number_points, self.__linear_sum, self.__square_sum);
148
    
149
    
150
    def __repr__(self):
151
        """!
152
        @return (string) Default cfentry representation.
153
        
154
        """
155
        return 'CF (%s, %s, %0.2f) [%s]' % ( self.number_points, self.linear_sum, self.__square_sum, hex(id(self)) );    
156
    
157
    
158
    def __str__(self):
159
        """!
160
        @brief Default cfentry string representation.
161
        
162
        """
163
        return self.__repr__();
164
    
165
    
166
    def __add__(self, entry):
167
        """!
168
        @brief Overloaded operator add. Performs addition of two clustering features.
169
        
170
        @param[in] entry (cfentry): Entry that is added to the current.
171
        
172
        @return (cfentry) Result of addition of two clustering features.
173
        
174
        """
175
        
176
        number_points = self.number_points + entry.number_points;
177
        result_linear_sum = list_math_addition(self.linear_sum, entry.linear_sum);
178
        result_square_sum = self.square_sum + entry.square_sum;  
179
        
180
        return cfentry(number_points, result_linear_sum, result_square_sum);
181
    
182
    
183
    def __sub__(self, entry):
184
        """!
185
        @brief Overloaded operator sub. Performs substraction of two clustering features.
186
        @details Substraction can't be performed with clustering feature whose description is less then substractor.
187
        
188
        @param[in] entry (cfentry): Entry that is substracted from the current.
189
        
190
        @return (cfentry) Result of substraction of two clustering features.
191
        
192
        """
193
                
194
        number_points = self.number_points - entry.number_points;
195
        result_linear_sum = list_math_subtraction(self.linear_sum, entry.linear_sum);
196
        result_square_sum = self.square_sum - entry.square_sum;
197
        
198
        if ( (number_points < 0) or (result_square_sum < 0) ):
199
            raise NameError("Substruction with negative result is not possible for clustering features.");
200
        
201
        return cfentry(number_points, result_linear_sum, result_square_sum);
202
    
203
    
204
    def __eq__(self, entry):
205
        """!
206
        @brief Overloaded operator eq. 
207
        @details Performs comparison of two clustering features.
208
        
209
        @param[in] entry (cfentry): Entry that is used for comparison with current.
210
        
211
        @return (bool) True is both clustering features are equals in line with tolerance, otherwise False.
212
        
213
        """
214
                
215
        tolerance = 0.00001;
216
        
217
        result = (self.number_points == entry.number_points);
218
        result &= ( (self.square_sum + tolerance > entry.square_sum) and (self.square_sum - tolerance < entry.square_sum) );
219
        
220
        for index_dimension in range(0, len(self.linear_sum)):
221
            result &= ( (self.linear_sum[index_dimension] + tolerance > entry.linear_sum[index_dimension]) and (self.linear_sum[index_dimension] - tolerance < entry.linear_sum[index_dimension]) );
222
        
223
        return result;
224
    
225
    
226
    def get_distance(self, entry, type_measurement):
227
        """!
228
        @brief Calculates distance between two clusters in line with measurement type.
229
        
230
        @details In case of usage CENTROID_EUCLIDIAN_DISTANCE square euclidian distance will be returned.
231
                 Square root should be taken from the result for obtaining real euclidian distance between
232
                 entries. 
233
        
234
        @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
235
        @param[in] type_measurement (measurement_type): Distance measurement algorithm between two clusters.
236
        
237
        @return (double) Distance between two clusters.
238
        
239
        """
240
        
241
        if (type_measurement is measurement_type.CENTROID_EUCLIDIAN_DISTANCE):
242
            return euclidean_distance_sqrt(entry.get_centroid(), self.get_centroid());
243
        
244
        elif (type_measurement is measurement_type.CENTROID_MANHATTAN_DISTANCE):
245
            return manhattan_distance(entry.get_centroid(), self.get_centroid());
246
        
247
        elif (type_measurement is measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE):
248
            return self.__get_average_inter_cluster_distance(entry);
249
            
250
        elif (type_measurement is measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE):
251
            return self.__get_average_intra_cluster_distance(entry);
252
        
253
        elif (type_measurement is measurement_type.VARIANCE_INCREASE_DISTANCE):
254
            return self.__get_variance_increase_distance(entry);
255
        
256
        else:
257
            assert 0;
258
    
259
        
260
    def get_centroid(self):
261
        """!
262
        @brief Calculates centroid of cluster that is represented by the entry. 
263
        @details It's calculated once when it's requested after the last changes.
264
        
265
        @return (list) Centroid of cluster that is represented by the entry.
266
        
267
        """
268
        
269
        if (self.__centroid is not None):
270
            return self.__centroid;
271
        
272
        self.__centroid = [0] * len(self.linear_sum);
273
        for index_dimension in range(0, len(self.linear_sum)):
274
            self.__centroid[index_dimension] = self.linear_sum[index_dimension] / self.number_points;
275
        
276
        return self.__centroid;
277
    
278
    
279
    def get_radius(self):
280
        """!
281
        @brief Calculates radius of cluster that is represented by the entry.
282
        @details It's calculated once when it's requested after the last changes.
283
        
284
        @return (double) Radius of cluster that is represented by the entry.
285
        
286
        """
287
        
288
        if (self.__radius is not None):
289
            return self.__radius;
290
        
291
        centroid = self.get_centroid();
292
        
293
        radius_part_1 = self.square_sum;
294
        
295
        radius_part_2 = 0.0;
296
        radius_part_3 = 0.0;
297
        
298
        if (type(centroid) == list):
299
            radius_part_2 = 2.0 * sum(list_math_multiplication(self.linear_sum, centroid));
300
            radius_part_3 = self.number_points * sum(list_math_multiplication(centroid, centroid));
301
        else:
302
            radius_part_2 = 2.0 * self.linear_sum * centroid;
303
            radius_part_3 = self.number_points * centroid * centroid;
304
        
305
        self.__radius = ( (1.0 / self.number_points) * (radius_part_1 - radius_part_2 + radius_part_3) ) ** 0.5;
306
        return self.__radius;
307
        
308
    
309
    def get_diameter(self):
310
        """!
311
        @brief Calculates diameter of cluster that is represented by the entry.
312
        @details It's calculated once when it's requested after the last changes.
313
        
314
        @return (double) Diameter of cluster that is represented by the entry.
315
        
316
        """
317
        
318
        if (self.__diameter is not None):
319
            return self.__diameter;
320
        
321
        diameter_part = 0.0;
322
        if (type(self.linear_sum) == list):
323
            diameter_part = self.square_sum * self.number_points - 2.0 * sum(list_math_multiplication(self.linear_sum, self.linear_sum)) + self.square_sum * self.number_points;
324
        else:
325
            diameter_part = self.square_sum * self.number_points - 2.0 * self.linear_sum * self.linear_sum + self.square_sum * self.number_points;
326
            
327
        self.__diameter = ( diameter_part / (self.number_points * (self.number_points - 1)) ) ** 0.5;
328
        return self.__diameter;
329
    
330
        
331
    def __get_average_inter_cluster_distance(self, entry):
332
        """!
333
        @brief Calculates average inter cluster distance between current and specified clusters.
334
        
335
        @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
336
        
337
        @return (double) Average inter cluster distance.
338
        
339
        """
340
        
341
        linear_part_distance = sum(list_math_multiplication(self.linear_sum, entry.linear_sum));
342
        
343
        return ( (entry.number_points * self.square_sum - 2.0 * linear_part_distance + self.number_points * entry.square_sum) / (self.number_points * entry.number_points) ) ** 0.5;
344
    
345
    
346
    def __get_average_intra_cluster_distance(self, entry):
347
        """!
348
        @brief Calculates average intra cluster distance between current and specified clusters.
349
        
350
        @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
351
        
352
        @return (double) Average intra cluster distance.
353
        
354
        """
355
        
356
        linear_part_first = list_math_addition(self.linear_sum, entry.linear_sum);
357
        linear_part_second = linear_part_first;
358
        
359
        linear_part_distance = sum(list_math_multiplication(linear_part_first, linear_part_second));
360
        
361
        general_part_distance = 2.0 * (self.number_points + entry.number_points) * (self.square_sum + entry.square_sum) - 2.0 * linear_part_distance;
362
        
363
        return (general_part_distance / ( (self.number_points + entry.number_points) * (self.number_points + entry.number_points - 1.0) )) ** 0.5;
364
    
365
    
366
    def __get_variance_increase_distance(self, entry):
367
        """!
368
        @brief Calculates variance increase distance between current and specified clusters.
369
        
370
        @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
371
        
372
        @return (double) Variance increase distance.
373
        
374
        """
375
                
376
        linear_part_12 = list_math_addition(self.linear_sum, entry.linear_sum);
377
        variance_part_first = (self.square_sum + entry.square_sum) - \
378
            2.0 * sum(list_math_multiplication(linear_part_12, linear_part_12)) / (self.number_points + entry.number_points) + \
379
            (self.number_points + entry.number_points) * sum(list_math_multiplication(linear_part_12, linear_part_12)) / (self.number_points + entry.number_points)**2.0;
380
381
        
382
        linear_part_11 = sum(list_math_multiplication(self.linear_sum, self.linear_sum));
383
        variance_part_second = -( self.square_sum - (2.0 * linear_part_11 / self.number_points) + (linear_part_11 / self.number_points) );
384
        
385
        linear_part_22 = sum(list_math_multiplication(entry.linear_sum, entry.linear_sum));
386
        variance_part_third = -( entry.square_sum - (2.0 / entry.number_points) * linear_part_22 + entry.number_points * (1.0 / entry.number_points ** 2.0) * linear_part_22 );
387
388
        return (variance_part_first + variance_part_second + variance_part_third);
389
        
390
391
class cfnode:
392
    """!
393
    @brief Representation of node of CF-Tree.
394
    
395
    """
396
    
397
    def __init__(self, feature, parent, payload):
398
        """!
399
        @brief Constructor of abstract CF node.
400
        
401
        @param[in] feature (cfentry): Clustering feature of the created node.
402
        @param[in] parent (cfnode): Parent of the created node.
403
        @param[in] payload (*): Data that is stored by the node.
404
        
405
        """
406
        
407
        ## Clustering feature of the node.
408
        self.feature = copy(feature);
409
        
410
        ## Pointer to the parent node (None for root).
411
        self.parent = parent;
412
        
413
        ## Type node (leaf or non-leaf).
414
        self.type = cfnode_type.CFNODE_DUMMY;
415
        
416
        ## Payload of node where user data can be stored.
417
        self.payload = payload;
418
        
419
    
420
    def __repr__(self):
421
        """!
422
        @return (string) Default representation of CF node.
423
        
424
        """
425
        
426
        return 'CF node %s, parent %s, feature %s' % ( hex(id(self)), self.parent, self.feature );
427
428
429
    def __str__(self):
430
        """!
431
        @return (string) String representation of CF node.
432
        
433
        """
434
        return self.__repr__();
435
    
436
    
437
    def get_distance(self, node, type_measurement):
438
        """!
439
        @brief Calculates distance between nodes in line with specified type measurement.
440
        
441
        @param[in] node (cfnode): CF-node that is used for calculation distance to the current node.
442
        @param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance.
443
        
444
        @return (double) Distance between two nodes.
445
        
446
        """
447
        
448
        return self.feature.get_distance(node.feature, type_measurement);
449
    
450
451
class non_leaf_node(cfnode):
452
    """!
453
    @brief Representation of clustering feature non-leaf node.
454
    
455
    """ 
456
    
457
    @property
458
    def successors(self):
459
        """!
460
        @return (list) List of successors of the node.
461
        
462
        """
463
        return self.__successors;
464
    
465
    
466
    def __init__(self, feature, parent, successors, payload):
467
        """!
468
        @brief Create CF Non-leaf node.
469
        
470
        @param[in] feature (cfentry): Clustering feature of the created node.
471
        @param[in] parent (non_leaf_node): Parent of the created node.
472
        @param[in] successors (list): List of successors of the node.
473
        @param[in] payload (*): Data that is stored by the node.
474
        
475
        """
476
                
477
        super().__init__(feature, parent, payload);
478
        
479
        ## Node type in CF tree that is CFNODE_NONLEAF for non leaf node.
480
        self.type = cfnode_type.CFNODE_NONLEAF;
481
        
482
        self.__successors = successors;
483
    
484
    
485
    def __repr__(self):     
486
        """!
487
        @return (string) Representation of non-leaf node representation.
488
        
489
        """   
490
        return 'Non-leaf node %s, parent %s, feature %s, successors: %d' % ( hex(id(self)), self.parent, self.feature, len(self.successors) );
491
    
492
    
493
    def __str__(self):
494
        """!
495
        @return (string) String non-leaf representation.
496
        
497
        """
498
        return self.__repr__();
499
    
500
    
501
    def insert_successor(self, successor):
502
        """!
503
        @brief Insert successor to the node.
504
        
505
        @param[in] successor (cfnode): Successor for adding.
506
        
507
        """
508
        
509
        self.feature += successor.feature;
510
        self.successors.append(successor);
511
        
512
        successor.parent = self;
513
    
514
    
515
    def remove_successor(self, successor):
516
        """!
517
        @brief Remove successor from the node.
518
        
519
        @param[in] successor (cfnode): Successor for removing.
520
        
521
        """
522
        
523
        self.feature -= successor.feature;
524
        self.successors.append(successor);
525
        
526
        successor.parent = self;
527
    
528
    
529
    def merge(self, node):
530
        """!
531
        @brief Merge non-leaf node to the current.
532
        
533
        @param[in] node (non_leaf_node): Non-leaf node that should be merged with current.
534
        
535
        """
536
                
537
        self.feature += node.feature;
538
        
539
        for child in node.successors:
540
            child.parent = self;      
541
    
542
    
543
    def get_farthest_successors(self, type_measurement):
544
        """!
545
        @brief Find pair of farthest successors of the node in line with measurement type.
546
        
547
        @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest successors.
548
        
549
        @return (list) Pair of farthest successors represented by list [cfnode1, cfnode2].
550
        
551
        """
552
        
553
        farthest_node1 = None;
554
        farthest_node2 = None;
555
        farthest_distance = 0;
556
        
557
        for i in range(0, len(self.successors)):
558
            candidate1 = self.successors[i];
559
            
560
            for j in range(i + 1, len(self.successors)):
561
                candidate2 = self.successors[j];
562
                candidate_distance = candidate1.get_distance(candidate2, type_measurement);
563
                
564
                if (candidate_distance > farthest_distance):
565
                    farthest_distance = candidate_distance;
566
                    farthest_node1 = candidate1;
567
                    farthest_node2 = candidate2;        
568
        
569
                    return [farthest_node1, farthest_node2];
570 View Code Duplication
    
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
571
    
572
    def get_nearest_successors(self, type_measurement):
573
        """!
574
        @brief Find pair of nearest successors of the node in line with measurement type.
575
        
576
        @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest successors.
577
        
578
        @return (list) Pair of nearest successors represented by list.
579
        
580
        """
581
                
582
        nearest_node1 = None;
583
        nearest_node2 = None;
584
        nearest_distance = float("Inf");
585
        
586
        for i in range(0, len(self.successors)):
587
            candidate1 = self.successors[i];
588
            
589
            for j in range(i + 1, len(self.successors)):
590
                candidate2 = self.successors[j];
591
                candidate_distance = candidate1.get_distance(candidate2, type_measurement);
592
                
593
                if (candidate_distance < nearest_distance):
594
                    nearest_distance = candidate_distance;
595
                    nearest_node1 = candidate1;
596
                    nearest_node2 = candidate2;        
597
        
598
        return [nearest_node1, nearest_node2];    
599
600
601
class leaf_node(cfnode):
602
    """!
603
    @brief Represents clustering feature leaf node.
604
    
605
    """
606
    
607
    @property
608
    def entries(self):
609
        """!
610
        @return (list) List of leaf nodes.
611
        
612
        """
613
        return self.__entries;
614
    
615
    
616
    def __init__(self, feature, parent, entries, payload):
617
        """!
618
        @brief Create CF Leaf node.
619
        
620
        @param[in] feature (cfentry): Clustering feature of the created node.
621
        @param[in] parent (non_leaf_node): Parent of the created node.
622
        @param[in] entries (list): List of entries of the node.
623
        @param[in] payload (*): Data that is stored by the node.
624
        
625
        """
626
        
627
        super().__init__(feature, parent, payload);
628
        
629
        ## Node type in CF tree that is CFNODE_LEAF for leaf node.
630
        self.type = cfnode_type.CFNODE_LEAF;
631
        
632
        self.__entries = entries;   # list of clustering features
633
        
634
    
635
    def __repr__(self):
636
        """!
637
        @return (string) Default leaf node represenation.
638
        
639
        """
640
        text_entries = "\n";
641
        for entry in self.entries:
642
            text_entries += "\t" + str(entry) + "\n";
643
        
644
        return 'Leaf-node %s, parent %s, feature %s, entries: %d %s' % ( hex(id(self)), self.parent, self.feature, len(self.entries), text_entries );
645
    
646
    
647
    def __str__(self):
648
        """!
649
        @return (string) String leaf node representation.
650
        
651
        """
652
        return self.__repr__();
653
    
654
    
655
    def insert_entry(self, entry):
656
        """!
657
        @brief Insert new clustering feature to the leaf node.
658
        
659
        @param[in] entry (cfentry): Clustering feature.
660
        
661
        """
662
                              
663
        self.feature += entry;
664
        self.entries.append(entry);
665
        
666
    
667
    def remove_entry(self, entry):
668
        """!
669
        @brief Remove clustering feature from the leaf node.
670
        
671
        @param[in] entry (cfentry): Clustering feature.
672
        
673
        """
674
                
675
        self.feature -= entry;
676
        self.entries.remove(entry);
677
    
678
    
679
    def merge(self, node):
680
        """!
681
        @brief Merge leaf node to the current.
682
        
683
        @param[in] node (leaf_node): Leaf node that should be merged with current.
684
        
685
        """
686
        
687
        self.feature += node.feature;
688
        
689
        # Move entries from merged node
690
        for entry in node.entries:
691
            self.entries.append(entry);
692
            
693
    
694
    def get_farthest_entries(self, type_measurement):
695
        """!
696
        @brief Find pair of farthest entries of the node.
697
        
698
        @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining farthest entries.
699
        
700
        @return (list) Pair of farthest entries of the node that are represented by list.
701
        
702
        """
703
        
704
        farthest_entity1 = None;
705
        farthest_entity2 = None;
706
        farthest_distance = 0;
707
        
708
        for i in range(0, len(self.entries)):
709
            candidate1 = self.entries[i];
710
            
711
            for j in range(i + 1, len(self.entries)):
712
                candidate2 = self.entries[j];
713
                candidate_distance = candidate1.get_distance(candidate2, type_measurement);
714
                
715
                if (candidate_distance > farthest_distance):
716
                    farthest_distance = candidate_distance;
717
                    farthest_entity1 = candidate1;
718
                    farthest_entity2 = candidate2;
719
        
720
        return [farthest_entity1, farthest_entity2];
721
    
722
    
723
    def get_nearest_index_entry(self, entry, type_measurement):
724
        """!
725
        @brief Find nearest index of nearest entry of node for the specified entry.
726
        
727
        @param[in] entry (cfentry): Entry that is used for calculation distance.
728
        @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
729
        
730
        @return (uint) Index of nearest entry of node for the specified entry.
731
        
732
        """
733
        
734
        minimum_distance = float('Inf');
735
        nearest_index = 0;
736
        
737
        for candidate_index in range(0, len(self.entries)):
738
            candidate_distance = self.entries[candidate_index].get_distance(entry, type_measurement);
739
            if (candidate_distance < minimum_distance):
740
                nearest_index = candidate_index;
741
                
742
        return nearest_index;
743
    
744
    
745
    def get_nearest_entry(self, entry, type_measurement):
746
        """!
747
        @brief Find nearest entry of node for the specified entry.
748
        
749
        @param[in] entry (cfentry): Entry that is used for calculation distance.
750
        @param[in] type_measurement (measurement_type): Measurement type that is used for obtaining nearest entry to the specified.
751
        
752
        @return (cfentry) Nearest entry of node for the specified entry.
753
        
754
        """
755
        
756
        min_key = lambda cur_entity: cur_entity.get_distance(entry, type_measurement);
757
        return min(self.__entries, key = min_key);
758
759
760
class cftree:
761
    """!
762
    @brief CF-Tree representation.
763
    @details A CF-tree is a height-balanced tree with two parameters: branching factor and threshold.
764
    
765
    """
766
    
767
    @property
768
    def root(self):
769
        """!
770
        @return (cfnode) Root of the tree.
771
        
772
        """
773
        return self.__root;
774
    
775
    
776
    @property
777
    def leafes(self):
778
        """!
779
        @return (list) List of all non-leaf nodes in the tree.
780
        
781
        """
782
        return self.__leafes;
783
    
784
    
785
    @property
786
    def amount_nodes(self):
787
        """!
788
        @return (unit) Number of nodes (leaf and non-leaf) in the tree.
789
        
790
        """
791
        return self.__amount_nodes;
792
    
793
    
794
    @property
795
    def amount_entries(self):
796
        """!
797
        @return (uint) Number of entries in the tree.
798
        
799
        """
800
        return self.__amount_entries;
801
    
802
    
803
    @property
804
    def height(self):
805
        """!
806
        @return (uint) Height of the tree.
807
        
808
        """
809
        return self.__height;
810
    
811
    
812
    @property
813
    def branch_factor(self):
814
        """!
815
        @return (uint) Branching factor of the tree.
816
        @details Branching factor defines maximum number of successors in each non-leaf node.
817
        
818
        """
819
        return self.__branch_factor;
820
    
821
    
822
    @property
823
    def threshold(self):
824
        """!
825
        @return (double) Threshold of the tree that represents maximum diameter of sub-clusters that is formed by leaf node entries.
826
        
827
        """
828
        return self.__threshold;
829
    
830
    
831
    @property
832
    def max_entries(self):
833
        """!
834
        @return (uint) Maximum number of entries in each leaf node.
835
        
836
        """
837
        return self.__max_entries;
838
    
839
    
840
    @property
841
    def type_measurement(self):
842
        """!
843
        @return (measurement_type) Type that is used for measuring.
844
        
845
        """
846
        return self.__type_measurement;
847
    
848
    
849
    def __init__(self, branch_factor, max_entries, threshold, type_measurement = measurement_type.CENTROID_EUCLIDIAN_DISTANCE):
850
        """!
851
        @brief Create CF-tree.
852
        
853
        @param[in] branch_factor (uint): Maximum number of children for non-leaf nodes.
854
        @param[in] max_entries (uint): Maximum number of entries for leaf nodes.
855
        @param[in] threshold (double): Maximum diameter of feature clustering for each leaf node.
856
        @param[in] type_measurement (measurement_type): Measurement type that is used for calculation distance metrics.
857
        
858
        """
859
860
        self.__root = None;
861
862
        self.__branch_factor = branch_factor; # maximum number of children
863
        if (self.__branch_factor < 2):
864
            self.__branch_factor = 2;
865
        
866
        
867
        self.__threshold = threshold;         # maximum diameter of sub-clusters stored at the leaf nodes
868
        self.__max_entries = max_entries;
869
        
870
        self.__leafes = [];
871
        
872
        self.__type_measurement = type_measurement;
873
        
874
        # statistics
875
        self.__amount_nodes = 0;    # root, despite it can be None.
876
        self.__amount_entries = 0;
877
        self.__height = 0;          # tree size with root.
878
    
879
    
880
    def insert_cluster(self, cluster):
881
        """!
882
        @brief Insert cluster that is represented as list of points where each point is represented by list of coordinates.
883
        @details Clustering feature is created for that cluster and inserted to the tree.
884
        
885
        @param[in] cluster (list): Cluster that is represented by list of points that should be inserted to the tree.
886
        
887
        """
888
        
889
        entry = cfentry(len(cluster), linear_sum(cluster), square_sum(cluster));
890
        self.insert(entry);
891
        
892
        
893
    def insert(self, entry):
894
        """!
895
        @brief Insert clustering feature to the tree.
896
        
897
        @param[in] entry (cfentry): Clustering feature that should be inserted.
898
        
899
        """
900
                
901
        if (self.__root is None):
902
            node = leaf_node(entry, None, [ entry ], None);
903
            
904
            self.__root = node;
905
            self.__leafes.append(node);
906
            
907
            # Update statistics
908
            self.__amount_entries += 1;
909
            self.__amount_nodes += 1;
910
            self.__height += 1;             # root has successor now
911
        else:
912
            child_node_updation = self.__recursive_insert(entry, self.__root);
913
            if (child_node_updation is True):
914
                # Splitting has been finished, check for possibility to merge (at least we have already two children).
915
                if (self.__merge_nearest_successors(self.__root) is True):
916
                    self.__amount_nodes -= 1;
917
    
918
    
919
    def find_nearest_leaf(self, entry, search_node = None):
920
        """!
921
        @brief Search nearest leaf to the specified clustering feature.
922
        
923
        @param[in] entry (cfentry): Clustering feature.
924
        @param[in] search_node (cfnode): Node from that searching should be started, if None then search process will be started for the root.
925
        
926
        @return (leaf_node) Nearest node to the specified clustering feature.
927
        
928
        """
929
        
930
        if (search_node is None):
931
            search_node = self.__root;
932
        
933
        nearest_node = search_node;
934
        
935
        if (search_node.type == cfnode_type.CFNODE_NONLEAF):
936
            min_key = lambda child_node: child_node.feature.get_distance(entry, self.__type_measurement);
937
            nearest_child_node = min(search_node.successors, key = min_key);
938
            
939
            nearest_node = self.find_nearest_leaf(entry, nearest_child_node);
940
        
941
        return nearest_node;
942
    
943
    
944
    def __recursive_insert(self, entry, search_node):
945
        """!
946
        @brief Recursive insert of the entry to the tree.
947
        @details It performs all required procedures during insertion such as splitting, merging.
948
        
949
        @param[in] entry (cfentry): Clustering feature.
950
        @param[in] search_node (cfnode): Node from that insertion should be started.
951
        
952
        @return (bool) True if number of nodes at the below level is changed, otherwise False.
953
        
954
        """
955
        
956
        # None-leaf node
957
        if (search_node.type == cfnode_type.CFNODE_NONLEAF):
958
            return self.__insert_for_noneleaf_node(entry, search_node);
959
        
960
        # Leaf is reached 
961
        else:
962
            return self.__insert_for_leaf_node(entry, search_node);
963
    
964
    
965
    def __insert_for_leaf_node(self, entry, search_node):
966
        """!
967
        @brief Recursive insert entry from leaf node to the tree.
968
        
969
        @param[in] entry (cfentry): Clustering feature.
970
        @param[in] search_node (cfnode): None-leaf node from that insertion should be started.
971
        
972
        @return (bool) True if number of nodes at the below level is changed, otherwise False.
973
        
974
        """
975
        
976
        node_amount_updation = False;
977
        
978
        # Try to absorb by the entity
979
        index_nearest_entry = search_node.get_nearest_index_entry(entry, self.__type_measurement);
980
        merged_entry = search_node.entries[index_nearest_entry] + entry;
981
        
982
        # Otherwise try to add new entry
983
        if (merged_entry.get_diameter() > self.__threshold):
984
            # If it's not exceeded append entity and update feature of the leaf node.
985
            search_node.insert_entry(entry);
986
            
987
            # Otherwise current node should be splitted
988
            if (len(search_node.entries) > self.__max_entries):
989
                if (search_node is self.__root):
990
                    self.__root = non_leaf_node(search_node.feature, None, [ search_node ], None);
991
                    search_node.parent = self.__root;
992
                    
993
                    # Update statistics
994
                    self.__amount_nodes += 1;
995
                    self.__height += 1;
996
                
997
                [new_node1, new_node2] = self.__split_leaf_node(search_node);
998
                
999
                self.__leafes.remove(search_node);
1000
                self.__leafes.append(new_node1);
1001
                self.__leafes.append(new_node2);
1002
                
1003
                # Update parent list of successors
1004
                parent = search_node.parent;
1005
                parent.successors.remove(search_node);
1006
                parent.successors.append(new_node1);
1007
                parent.successors.append(new_node2);
1008
                
1009
                # Update statistics
1010
                self.__amount_nodes += 1;
1011
                node_amount_updation = True;
1012
            
1013
            # Update statistics
1014
            self.__amount_entries += 1;
1015
            
1016
        else:
1017
            search_node.entries[index_nearest_entry] = merged_entry;
1018
            search_node.feature += entry;
1019
        
1020
        return node_amount_updation;
1021
    
1022
    
1023
    def __insert_for_noneleaf_node(self, entry, search_node):
1024
        """!
1025
        @brief Recursive insert entry from none-leaf node to the tree.
1026
        
1027
        @param[in] entry (cfentry): Clustering feature.
1028
        @param[in] search_node (cfnode): None-leaf node from that insertion should be started.
1029
        
1030
        @return (bool) True if number of nodes at the below level is changed, otherwise False.
1031
        
1032
        """
1033
        
1034
        node_amount_updation = False;
1035
        
1036
        min_key = lambda child_node: child_node.get_distance(search_node, self.__type_measurement);
1037
        nearest_child_node = min(search_node.successors, key = min_key);
1038
        
1039
        child_node_updation = self.__recursive_insert(entry, nearest_child_node);
1040
        
1041
        # Update clustering feature of none-leaf node.
1042
        search_node.feature += entry;
1043
            
1044
        # Check branch factor, probably some leaf has been splitted and threshold has been exceeded.
1045
        if (len(search_node.successors) > self.__branch_factor):
1046
            
1047
            # Check if it's aleady root then new root should be created (height is increased in this case).
1048
            if (search_node is self.__root):
1049
                self.__root = non_leaf_node(search_node.feature, None, [ search_node ], None);
1050
                search_node.parent = self.__root;
1051
                
1052
                # Update statistics
1053
                self.__amount_nodes += 1;
1054
                self.__height += 1;
1055
                
1056
            [new_node1, new_node2] = self.__split_nonleaf_node(search_node);
1057
            
1058
            # Update parent list of successors
1059
            parent = search_node.parent;
1060
            parent.successors.remove(search_node);
1061
            parent.successors.append(new_node1);
1062
            parent.successors.append(new_node2);
1063
            
1064
            # Update statistics
1065
            self.__amount_nodes += 1;
1066
            node_amount_updation = True;
1067
            
1068
        elif (child_node_updation is True):
1069
            # Splitting has been finished, check for possibility to merge (at least we have already two children).
1070
            if (self.__merge_nearest_successors(search_node) is True):
1071
                self.__amount_nodes -= 1;
1072
        
1073
        return node_amount_updation;
1074
    
1075
    
1076
    def __merge_nearest_successors(self, node):
1077
        """!
1078
        @brief Find nearest sucessors and merge them.
1079
        
1080
        @param[in] node (non_leaf_node): Node whose two nearest successors should be merged.
1081
        
1082
        @return (bool): True if merging has been successfully performed, otherwise False.
1083
        
1084
        """
1085
        
1086
        merging_result = False;
1087
        
1088
        if (len(node.successors) < self.__branch_factor):
1089
            if (node.successors[0].type == cfnode_type.CFNODE_NONLEAF):
1090
                [nearest_child_node1, nearest_child_node2] = node.get_nearest_successors(self.__type_measurement);
1091
                
1092
                node.successors.remove(nearest_child_node2);
1093
                if (nearest_child_node2.type == cfnode_type.CFNODE_LEAF):
1094
                    self.__leafes.remove(nearest_child_node2);
1095
                
1096
                nearest_child_node1.merge(nearest_child_node2);
1097
                
1098
                merging_result = True;
1099
        
1100
        return merging_result;
1101
            
1102
    
1103
    def __split_nonleaf_node(self, node):
1104
        """!
1105
        @brief Performs splitting of the specified non-leaf node.
1106
        
1107
        @param[in] node (non_leaf_node): Non-leaf node that should be splitted.
1108
        
1109
        @return (list) New pair of non-leaf nodes [non_leaf_node1, non_leaf_node2].
1110
        
1111
        """
1112
        
1113
        [farthest_node1, farthest_node2] = node.get_farthest_successors(self.__type_measurement);
1114
        
1115
        # create new non-leaf nodes
1116
        new_node1 = non_leaf_node(farthest_node1.feature, node.parent, [ farthest_node1 ], None);
1117
        new_node2 = non_leaf_node(farthest_node2.feature, node.parent, [ farthest_node2 ], None);
1118
        
1119
        farthest_node1.parent = new_node1;
1120
        farthest_node2.parent = new_node2;
1121
        
1122
        # re-insert other successors
1123
        for successor in node.successors:
1124
            if ( (successor is not farthest_node1) and (successor is not farthest_node2) ):
1125
                distance1 = new_node1.get_distance(successor, self.__type_measurement);
1126
                distance2 = new_node2.get_distance(successor, self.__type_measurement);
1127
                
1128
                if (distance1 < distance2):
1129
                    new_node1.insert_successor(successor);
1130
                else:
1131
                    new_node2.insert_successor(successor);
1132
        
1133
        return [new_node1, new_node2];
1134
    
1135
    
1136
    def __split_leaf_node(self, node):
1137
        """!
1138
        @brief Performs splitting of the specified leaf node.
1139
        
1140
        @param[in] node (leaf_node): Leaf node that should be splitted.
1141
        
1142
        @return (list) New pair of leaf nodes [leaf_node1, leaf_node2].
1143
        
1144
        @warning Splitted node is transformed to non_leaf.
1145
        
1146
        """
1147
        
1148
        # search farthest pair of entries
1149
        [farthest_entity1, farthest_entity2] = node.get_farthest_entries(self.__type_measurement);
1150
                    
1151
        # create new nodes
1152
        new_node1 = leaf_node(farthest_entity1, node.parent, [ farthest_entity1 ], None);
1153
        new_node2 = leaf_node(farthest_entity2, node.parent, [ farthest_entity2 ], None);
1154
        
1155
        # re-insert other entries
1156
        for entity in node.entries:
1157
            if ( (entity is not farthest_entity1) and (entity is not farthest_entity2) ):
1158
                distance1 = new_node1.feature.get_distance(entity, self.__type_measurement);
1159
                distance2 = new_node2.feature.get_distance(entity, self.__type_measurement);
1160
                
1161
                if (distance1 < distance2):
1162
                    new_node1.insert_entry(entity);
1163
                else:
1164
                    new_node2.insert_entry(entity);
1165
        
1166
        return [new_node1, new_node2];
1167