1
|
|
|
'''
|
2
|
|
|
|
3
|
|
|
Unit-tests for CF-Tree.
|
4
|
|
|
|
5
|
|
|
Copyright (C) 2015 Andrei Novikov ([email protected])
|
6
|
|
|
|
7
|
|
|
pyclustering is free software: you can redistribute it and/or modify
|
8
|
|
|
it under the terms of the GNU General Public License as published by
|
9
|
|
|
the Free Software Foundation, either version 3 of the License, or
|
10
|
|
|
(at your option) any later version.
|
11
|
|
|
|
12
|
|
|
pyclustering is distributed in the hope that it will be useful,
|
13
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
|
|
GNU General Public License for more details.
|
16
|
|
|
|
17
|
|
|
You should have received a copy of the GNU General Public License
|
18
|
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
|
|
|
20
|
|
|
'''
|
21
|
|
|
|
22
|
|
|
import unittest;
|
23
|
|
|
|
24
|
|
|
import numpy;
|
25
|
|
|
|
26
|
|
|
from pyclustering.container.cftree import cfentry, cftree;
|
27
|
|
|
from pyclustering.container.cftree import measurement_type;
|
28
|
|
|
|
29
|
|
|
from pyclustering.utils import linear_sum, square_sum;
|
30
|
|
|
from pyclustering.utils import euclidean_distance_sqrt, manhattan_distance, average_inter_cluster_distance, average_intra_cluster_distance, variance_increase_distance;
|
31
|
|
|
|
32
|
|
|
from random import random;
|
33
|
|
|
|
34
|
|
|
class Test(unittest.TestCase):
|
35
|
|
|
def templateCfClusterRepresentation(self, cluster, centroid, radius, diameter, tolerance):
|
36
|
|
|
entry = cfentry(len(cluster), linear_sum(cluster), square_sum(cluster));
|
37
|
|
|
|
38
|
|
|
assertion_centroid = centroid;
|
39
|
|
|
if (type(centroid) != list):
|
40
|
|
|
assertion_centroid = [ centroid ];
|
41
|
|
|
|
42
|
|
|
if (type(centroid) == list):
|
43
|
|
|
for dimension in range(0, len(assertion_centroid)):
|
44
|
|
|
assert (assertion_centroid[dimension] - tolerance < ( entry.get_centroid() )[dimension]) and (( entry.get_centroid() )[dimension] < assertion_centroid[dimension] + tolerance);
|
45
|
|
|
|
46
|
|
|
assert (radius - tolerance < entry.get_radius()) and (entry.get_radius() < radius + tolerance);
|
47
|
|
|
assert (diameter - tolerance < entry.get_diameter()) and (entry.get_diameter() < diameter + tolerance);
|
48
|
|
|
|
49
|
|
|
def testCfClusterRepresentationOneDimension2(self):
|
50
|
|
|
cluster = [ [0.1], [0.2], [0.5], [0.4], [0.6] ];
|
51
|
|
|
self.templateCfClusterRepresentation(cluster, 0.36, 0.18547, 0.29326, 0.0001);
|
52
|
|
|
|
53
|
|
|
def testCfClusterRepresentationTwoDimension(self):
|
54
|
|
|
cluster = [ [0.1, 0.1], [0.2, 0.2], [0.5, 0.5], [0.4, 0.4], [0.6, 0.6] ];
|
55
|
|
|
self.templateCfClusterRepresentation(cluster, [0.36, 0.36], 0.26230, 0.41473, 0.0001);
|
56
|
|
|
|
57
|
|
|
|
58
|
|
|
def templateCfEntryValueDistance(self, cluster1, cluster2, value, tolerance, type_measurment):
|
59
|
|
|
entry1 = cfentry(len(cluster1), linear_sum(cluster1), square_sum(cluster1));
|
60
|
|
|
entry2 = cfentry(len(cluster2), linear_sum(cluster2), square_sum(cluster2));
|
61
|
|
|
|
62
|
|
|
distance = entry1.get_distance(entry2, type_measurment);
|
63
|
|
|
assert ( (value - tolerance < distance) and (value + tolerance > distance) );
|
64
|
|
|
|
65
|
|
|
def testCfEntryTwoPoints1(self):
|
66
|
|
|
self.templateCfEntryValueDistance([[0.0]], [[1.0]], 1.0, 0.0000001, measurement_type.CENTROID_EUCLIDIAN_DISTANCE);
|
67
|
|
|
|
68
|
|
|
def testCfEntryTwoPoints2(self):
|
69
|
|
|
self.templateCfEntryValueDistance([[0.0]], [[0.0]], 0.0, 0.0000001, measurement_type.CENTROID_EUCLIDIAN_DISTANCE);
|
70
|
|
|
|
71
|
|
|
def testCfEntryTwoPoints3(self):
|
72
|
|
|
self.templateCfEntryValueDistance([[-1.0]], [[0.0]], 1.0, 0.0000001, measurement_type.CENTROID_EUCLIDIAN_DISTANCE);
|
73
|
|
|
|
74
|
|
|
def testCfEntryTwoPoints4(self):
|
75
|
|
|
self.templateCfEntryValueDistance([[1.0, 0.0]], [[0.0, 1.0]], 2.0, 0.0000001, measurement_type.CENTROID_EUCLIDIAN_DISTANCE);
|
76
|
|
|
|
77
|
|
|
|
78
|
|
|
def testCfEntryIncrease(self):
|
79
|
|
|
tolerance = 0.00001;
|
80
|
|
|
cluster = [ [0.1, 0.1], [0.2, 0.2], [0.5, 0.5], [0.4, 0.4], [0.6, 0.6] ];
|
81
|
|
|
|
82
|
|
|
entry1 = cfentry(len(cluster), linear_sum(cluster), square_sum(cluster));
|
83
|
|
|
entry2 = entry1 + entry1;
|
84
|
|
|
|
85
|
|
|
assert cfentry(10, [3.6, 3.6], 3.28) == entry2;
|
86
|
|
|
|
87
|
|
|
entry2 = entry2 + entry2;
|
88
|
|
|
assert cfentry(20, [7.2, 7.2], 6.56) == entry2;
|
89
|
|
|
|
90
|
|
|
def testCfEntryDecrease(self):
|
91
|
|
|
entry1 = cfentry(10, [3.6, 3.6], 3.28);
|
92
|
|
|
entry2 = cfentry(5, [1.8, 1.8], 1.64);
|
93
|
|
|
|
94
|
|
|
assert entry2 == (entry1 - entry2);
|
95
|
|
|
|
96
|
|
|
|
97
|
|
|
def templateCfEntryDistance(self, type_measurement):
|
98
|
|
|
cluster1 = [[0.1, 0.1], [0.1, 0.2], [0.2, 0.1], [0.2, 0.2]];
|
99
|
|
|
cluster2 = [[0.4, 0.4], [0.4, 0.5], [0.5, 0.4], [0.5, 0.5]];
|
100
|
|
|
cluster3 = [[0.9, 0.9], [0.9, 1.0], [1.0, 0.9], [1.0, 1.0]];
|
101
|
|
|
|
102
|
|
|
entry1 = cfentry(len(cluster1), linear_sum(cluster1), square_sum(cluster1));
|
103
|
|
|
entry2 = cfentry(len(cluster2), linear_sum(cluster2), square_sum(cluster2));
|
104
|
|
|
entry3 = cfentry(len(cluster3), linear_sum(cluster3), square_sum(cluster3));
|
105
|
|
|
|
106
|
|
|
distance12 = entry1.get_distance(entry2, type_measurement);
|
107
|
|
|
distance23 = entry2.get_distance(entry3, type_measurement);
|
108
|
|
|
distance13 = entry1.get_distance(entry3, type_measurement);
|
109
|
|
|
|
110
|
|
|
assert distance12 < distance23;
|
111
|
|
|
assert distance23 < distance13;
|
112
|
|
|
|
113
|
|
|
def testCfDistanceCentroidEuclidian(self):
|
114
|
|
|
self.templateCfEntryDistance(measurement_type.CENTROID_EUCLIDIAN_DISTANCE);
|
115
|
|
|
|
116
|
|
|
def testCfDistanceCentroidManhatten(self):
|
117
|
|
|
self.templateCfEntryDistance(measurement_type.CENTROID_MANHATTAN_DISTANCE);
|
118
|
|
|
|
119
|
|
|
def testCfDistanceAverageInterCluster(self):
|
120
|
|
|
self.templateCfEntryDistance(measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE);
|
121
|
|
|
|
122
|
|
|
def testCfDistanceAverageIntraCluster(self):
|
123
|
|
|
self.templateCfEntryDistance(measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE);
|
124
|
|
|
|
125
|
|
|
def testCfDistanceVarianceIncrease(self):
|
126
|
|
|
self.templateCfEntryDistance(measurement_type.VARIANCE_INCREASE_DISTANCE);
|
127
|
|
|
|
128
|
|
|
|
129
|
|
|
def templateDistanceCalculation(self, cluster1, cluster2, type_measurement):
|
130
|
|
|
entry1 = cfentry(len(cluster1), linear_sum(cluster1), square_sum(cluster1));
|
131
|
|
|
entry2 = cfentry(len(cluster2), linear_sum(cluster2), square_sum(cluster2));
|
132
|
|
|
|
133
|
|
|
# check that the same distance from 1 to 2 and from 2 to 1.
|
134
|
|
|
distance12 = entry1.get_distance(entry2, type_measurement);
|
135
|
|
|
distance21 = entry2.get_distance(entry1, type_measurement);
|
136
|
|
|
|
137
|
|
|
assert distance12 == distance21;
|
138
|
|
|
|
139
|
|
|
# check with utils calculation
|
140
|
|
|
float_delta = 0.0000001;
|
141
|
|
|
if (type_measurement == measurement_type.CENTROID_EUCLIDIAN_DISTANCE):
|
142
|
|
|
assert distance12 == euclidean_distance_sqrt(entry1.get_centroid(), entry2.get_centroid());
|
143
|
|
|
|
144
|
|
|
elif (type_measurement == measurement_type.CENTROID_MANHATTAN_DISTANCE):
|
145
|
|
|
assert distance12 == manhattan_distance(entry1.get_centroid(), entry2.get_centroid());
|
146
|
|
|
|
147
|
|
|
elif (type_measurement == measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE):
|
148
|
|
|
assert numpy.isclose(distance12, average_inter_cluster_distance(cluster1, cluster2)) == True;
|
149
|
|
|
|
150
|
|
|
elif (type_measurement == measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE):
|
151
|
|
|
assert numpy.isclose(distance12, average_intra_cluster_distance(cluster1, cluster2)) == True;
|
152
|
|
|
|
153
|
|
|
elif (type_measurement == measurement_type.VARIANCE_INCREASE_DISTANCE):
|
154
|
|
|
assert numpy.isclose(distance12, variance_increase_distance(cluster1, cluster2)) == True;
|
155
|
|
|
|
156
|
|
|
def templateDistanceCalculationTheSimplestSample(self, type_measurement):
|
157
|
|
|
cluster1 = [[0.1, 0.1], [0.1, 0.2], [0.2, 0.1], [0.2, 0.2]];
|
158
|
|
|
cluster2 = [[0.4, 0.4], [0.4, 0.5], [0.5, 0.4], [0.5, 0.5]];
|
159
|
|
|
|
160
|
|
|
self.templateDistanceCalculation(cluster1, cluster2, type_measurement);
|
161
|
|
|
|
162
|
|
|
def testDistanceCalculationTheSimplestSampleCentroidEuclidian(self):
|
163
|
|
|
self.templateDistanceCalculationTheSimplestSample(measurement_type.CENTROID_EUCLIDIAN_DISTANCE);
|
164
|
|
|
|
165
|
|
|
def testDistanceCalculationTheSimplestSampleCentroidManhattan(self):
|
166
|
|
|
self.templateDistanceCalculationTheSimplestSample(measurement_type.CENTROID_MANHATTAN_DISTANCE);
|
167
|
|
|
|
168
|
|
|
def testDistanceCalculationTheSimplestSampleAverageInterClusterDistance(self):
|
169
|
|
|
self.templateDistanceCalculationTheSimplestSample(measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE);
|
170
|
|
|
|
171
|
|
|
def testDistanceCalculationTheSimplestSampleAverageIntraClusterDistance(self):
|
172
|
|
|
self.templateDistanceCalculationTheSimplestSample(measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE);
|
173
|
|
|
|
174
|
|
|
def testDistanceCalculationTheSimplestSampleVarianceIncreaseDistance(self):
|
175
|
|
|
self.templateDistanceCalculationTheSimplestSample(measurement_type.VARIANCE_INCREASE_DISTANCE);
|
176
|
|
|
|
177
|
|
|
|
178
|
|
|
def testCfTreeCreationWithOneEntry(self):
|
179
|
|
|
tree = cftree(2, 1, 1.0);
|
180
|
|
|
entry = cfentry(5, [0.0, 0.1], 0.05);
|
181
|
|
|
|
182
|
|
|
tree.insert(entry);
|
183
|
|
|
|
184
|
|
|
assert 1 == tree.amount_nodes;
|
185
|
|
|
assert 1 == tree.height;
|
186
|
|
|
assert 1 == tree.amount_entries;
|
187
|
|
|
|
188
|
|
|
assert entry == tree.root.feature;
|
189
|
|
|
assert None == tree.root.parent;
|
190
|
|
|
|
191
|
|
|
|
192
|
|
|
def testCfTreeCreationWithoutMerging(self):
|
193
|
|
|
clusters = [ [ [random() + j, random() + j] for i in range(10) ] for j in range(10) ];
|
194
|
|
|
tree = cftree(2, 1, 0.0);
|
195
|
|
|
|
196
|
|
|
for cluster in clusters:
|
197
|
|
|
tree.insert_cluster(cluster);
|
198
|
|
|
|
199
|
|
|
assert tree.height >= 4;
|
200
|
|
|
assert tree.amount_entries == 10;
|
201
|
|
|
assert len(tree.leafes) == 10;
|
202
|
|
|
|
203
|
|
|
def testCfTreeInserionOneLeafThreeEntries(self):
|
204
|
|
|
cluster1 = [[0.1, 0.1], [0.1, 0.2], [0.2, 0.1], [0.2, 0.2]];
|
205
|
|
|
cluster2 = [[0.4, 0.4], [0.4, 0.5], [0.5, 0.4], [0.5, 0.5]];
|
206
|
|
|
cluster3 = [[0.9, 0.9], [0.9, 1.0], [1.0, 0.9], [1.0, 1.0]];
|
207
|
|
|
|
208
|
|
|
tree = cftree(3, 4, 0.0);
|
209
|
|
|
tree.insert_cluster(cluster1);
|
210
|
|
|
tree.insert_cluster(cluster2);
|
211
|
|
|
tree.insert_cluster(cluster3);
|
212
|
|
|
|
213
|
|
|
entry1 = cfentry(len(cluster1), linear_sum(cluster1), square_sum(cluster1));
|
214
|
|
|
entry2 = cfentry(len(cluster2), linear_sum(cluster2), square_sum(cluster2));
|
215
|
|
|
entry3 = cfentry(len(cluster3), linear_sum(cluster3), square_sum(cluster3));
|
216
|
|
|
|
217
|
|
|
assert tree.find_nearest_leaf(entry1) == tree.find_nearest_leaf(entry2);
|
218
|
|
|
assert tree.find_nearest_leaf(entry2) == tree.find_nearest_leaf(entry3);
|
219
|
|
|
|
220
|
|
|
|
221
|
|
|
def templateCfTreeLeafIntegrity(self, number_clusters, branching_factor, max_entries, threshold):
|
222
|
|
|
clusters = [ [ [random() + j, random() + j] for i in range(10) ] for j in range(number_clusters) ];
|
223
|
|
|
tree = cftree(branching_factor, max_entries, threshold);
|
224
|
|
|
|
225
|
|
|
for index_cluster in range(0, len(clusters)):
|
226
|
|
|
tree.insert_cluster(clusters[index_cluster]);
|
227
|
|
|
|
228
|
|
|
result_searching = False;
|
229
|
|
|
for leaf in tree.leafes:
|
230
|
|
|
for node_entry in leaf.entries:
|
231
|
|
|
result_searching |= (node_entry == node_entry);
|
232
|
|
|
|
233
|
|
|
assert True == result_searching;
|
234
|
|
|
|
235
|
|
|
def testCfTreeLeafIntegrity10_2_1(self):
|
236
|
|
|
self.templateCfTreeLeafIntegrity(10, 2, 1, 0.0);
|
237
|
|
|
|
238
|
|
|
def testCfTreeLeafIntegrity10_3_1(self):
|
239
|
|
|
self.templateCfTreeLeafIntegrity(10, 3, 1, 0.0);
|
240
|
|
|
|
241
|
|
|
def testCfTreeLeafIntegrity20_4_1(self):
|
242
|
|
|
self.templateCfTreeLeafIntegrity(20, 4, 1, 0.0);
|
243
|
|
|
|
244
|
|
|
def testCfTreeLeafIntegrity20_4_2(self):
|
245
|
|
|
self.templateCfTreeLeafIntegrity(20, 4, 2, 0.0);
|
246
|
|
|
|
247
|
|
|
def testCfTreeLeafIntegrity40_10_5(self):
|
248
|
|
|
self.templateCfTreeLeafIntegrity(40, 10, 5, 0.0);
|
249
|
|
|
|
250
|
|
|
|
251
|
|
|
def testCfTreeEntryAbsorbing(self):
|
252
|
|
|
tree = cftree(2, 1, 10000.0);
|
253
|
|
|
absorbing_entry = cfentry(0, [0.0, 0.0], 0.0);
|
254
|
|
|
|
255
|
|
|
for offset in range(0, 10):
|
256
|
|
|
cluster = [ [random() + offset, random() + offset] for i in range(10)];
|
257
|
|
|
entry = cfentry(len(cluster), linear_sum(cluster), square_sum(cluster));
|
258
|
|
|
|
259
|
|
|
absorbing_entry += entry;
|
260
|
|
|
|
261
|
|
|
tree.insert(entry);
|
262
|
|
|
|
263
|
|
|
assert 1 == tree.amount_entries;
|
264
|
|
|
assert 1 == tree.amount_nodes;
|
265
|
|
|
assert 1 == tree.height;
|
266
|
|
|
|
267
|
|
|
assert None == tree.root.parent;
|
268
|
|
|
assert absorbing_entry == tree.root.feature;
|
269
|
|
|
|
270
|
|
|
|
271
|
|
|
def templateCfTreeTotalNumberPoints(self, number_points, dimension, branching_factor, number_entries, diameter):
|
272
|
|
|
tree = cftree(branching_factor, number_entries, diameter);
|
273
|
|
|
|
274
|
|
|
for index_point in range(0, number_points):
|
275
|
|
|
point = [ index_point for i in range(0, dimension) ];
|
276
|
|
|
|
277
|
|
|
tree.insert_cluster([ point ]);
|
278
|
|
|
|
279
|
|
|
number_points = 0;
|
280
|
|
|
for leaf in tree.leafes:
|
281
|
|
|
number_points += leaf.feature.number_points;
|
282
|
|
|
|
283
|
|
|
assert (index_point + 1) == number_points;
|
284
|
|
|
|
285
|
|
|
number_leaf_points = 0;
|
286
|
|
|
for leaf in tree.leafes:
|
287
|
|
|
number_leaf_points += leaf.feature.number_points;
|
288
|
|
|
|
289
|
|
|
assert number_points == tree.root.feature.number_points;
|
290
|
|
|
|
291
|
|
|
if (number_points != number_leaf_points):
|
292
|
|
|
print(number_points, number_leaf_points);
|
293
|
|
|
|
294
|
|
|
assert number_points == number_leaf_points;
|
295
|
|
|
|
296
|
|
|
def testCfTreeTotalNumberPoints10_1_5_5_NoDiameter(self):
|
297
|
|
|
self.templateCfTreeTotalNumberPoints(10, 1, 5, 5, 0.0);
|
298
|
|
|
|
299
|
|
|
def testCfTreeTotalNumberPoints10_1_5_5_WithDiameter(self):
|
300
|
|
|
self.templateCfTreeTotalNumberPoints(10, 1, 5, 5, 100.0);
|
301
|
|
|
|
302
|
|
|
def testCfTreeTotalNumberPoints10_1_5_5_WithSmallDiameter(self):
|
303
|
|
|
self.templateCfTreeTotalNumberPoints(10, 1, 5, 5, 2.5);
|
304
|
|
|
|
305
|
|
|
def testCfTreeTotalNumberPoints10_2_5_5_NoDiameter(self):
|
306
|
|
|
self.templateCfTreeTotalNumberPoints(10, 2, 5, 5, 0.0);
|
307
|
|
|
|
308
|
|
|
def testCfTreeTotalNumberPoints10_2_5_5_WithDiameter(self):
|
309
|
|
|
self.templateCfTreeTotalNumberPoints(10, 2, 5, 5, 100.0);
|
310
|
|
|
|
311
|
|
|
def testCfTreeTotalNumberPoints50_3_5_5_NoDiameter(self):
|
312
|
|
|
self.templateCfTreeTotalNumberPoints(50, 3, 5, 5, 0.0);
|
313
|
|
|
|
314
|
|
|
def testCfTreeTotalNumberPoints50_3_5_5_WithDiameter(self):
|
315
|
|
|
self.templateCfTreeTotalNumberPoints(50, 3, 5, 5, 100.0);
|
316
|
|
|
|
317
|
|
|
def testCfTreeTotalNumberPoints100_2_2_1_NoDiameter(self):
|
318
|
|
|
self.templateCfTreeTotalNumberPoints(100, 2, 2, 1, 0.0);
|
319
|
|
|
|
320
|
|
|
def testCfTreeTotalNumberPoints100_2_2_1_WithDiameter(self):
|
321
|
|
|
self.templateCfTreeTotalNumberPoints(100, 2, 2, 1, 100.0);
|
322
|
|
|
|
323
|
|
|
def testCfTreeTotalNumberPoints100_2_5_5_WithSmallDiameter(self):
|
324
|
|
|
self.templateCfTreeTotalNumberPoints(100, 2, 5, 5, 10.0);
|
325
|
|
|
|
326
|
|
|
|
327
|
|
|
if __name__ == "__main__":
|
328
|
|
|
unittest.main(); |