from collections import namedtuple, deque
from operator import attrgetter
from itertools import chain, count
import heapq

import numpy

import scipy.cluster.hierarchy

from Orange.distance import Euclidean, PearsonR

__all__ = ['HierarchicalClustering']

SINGLE = "single"
AVERAGE = "average"
COMPLETE = "complete"
WEIGHTED = "weighted"
WARD = "ward"


def condensedform(X, mode="upper"):
    X = numpy.asarray(X)
    assert len(X.shape) == 2
    assert X.shape[0] == X.shape[1]

    N = X.shape[0]

    if mode == "upper":
        i, j = numpy.triu_indices(N, k=1)
    elif mode == "lower":
        i, j = numpy.tril_indices(N, k=-1)
    else:
        raise ValueError("invalid mode")
    return X[i, j]


def squareform(X, mode="upper"):
    X = numpy.asarray(X)
    k = X.shape[0]
    N = int(numpy.ceil(numpy.sqrt(k * 2)))
    assert N * (N - 1) // 2 == k
    matrix = numpy.zeros((N, N), dtype=X.dtype)
    if mode == "upper":
        i, j = numpy.triu_indices(N, k=1)
        matrix[i, j] = X
        m, n = numpy.tril_indices(N, k=-1)
        matrix[m, n] = matrix.T[m, n]
    elif mode == "lower":
        i, j = numpy.tril_indices(N, k=-1)
        matrix[i, j] = X
        m, n = numpy.triu_indices(N, k=1)
        matrix[m, n] = matrix.T[m, n]
    return matrix
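

# Example (illustrative sketch, not part of the original module): for a
# symmetric, zero-diagonal distance matrix, `condensedform` extracts the
# strict upper triangle row by row, and `squareform` rebuilds the square
# matrix from that vector:
#
#     >>> D = numpy.array([[0.0, 1.0, 2.0],
#     ...                  [1.0, 0.0, 3.0],
#     ...                  [2.0, 3.0, 0.0]])
#     >>> condensedform(D).tolist()
#     [1.0, 2.0, 3.0]
#     >>> bool(numpy.all(squareform(condensedform(D)) == D))
#     True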


def data_clustering(data, distance=Euclidean,
                    linkage=AVERAGE):
    """
    Return the hierarchical clustering of the data set's rows.

    :param Orange.data.Table data: Data set to cluster.
    :param Orange.distance.Distance distance: A distance measure.
    :param str linkage:
        The linkage method; one of ``SINGLE``, ``AVERAGE``, ``COMPLETE``,
        ``WEIGHTED`` or ``WARD``.
    """
    matrix = distance(data)
    return dist_matrix_clustering(matrix, linkage=linkage)


def feature_clustering(data, distance=PearsonR,
                       linkage=AVERAGE):
    """
    Return the hierarchical clustering of the data set's columns.

    :param Orange.data.Table data: Data set to cluster.
    :param Orange.distance.Distance distance: A distance measure.
    :param str linkage:
        The linkage method; one of ``SINGLE``, ``AVERAGE``, ``COMPLETE``,
        ``WEIGHTED`` or ``WARD``.
    """
    matrix = distance(data, axis=0)
    return dist_matrix_clustering(matrix, linkage=linkage)


def dist_matrix_linkage(matrix, linkage=AVERAGE):
    """
    Return the linkage matrix computed from a precomputed distance matrix.

    :param Orange.misc.DistMatrix matrix: A symmetric distance matrix.
    :param str linkage: The linkage method (passed to scipy as ``method``).
    """
    # Extract the condensed (upper triangular) distance matrix.
    distances = condensedform(matrix)
    return scipy.cluster.hierarchy.linkage(distances, method=linkage)


def dist_matrix_clustering(matrix, linkage=AVERAGE):
    """
    Return the hierarchical clustering using a precomputed distance matrix.

    :param Orange.misc.DistMatrix matrix: A symmetric distance matrix.
    :param str linkage: The linkage method (passed to scipy as ``method``).
    """
    # Extract the condensed (upper triangular) distance matrix.
    distances = condensedform(matrix)
    Z = scipy.cluster.hierarchy.linkage(distances, method=linkage)
    return tree_from_linkage(Z)


def sample_clustering(X, linkage=AVERAGE, metric="euclidean"):
    assert len(X.shape) == 2
    Z = scipy.cluster.hierarchy.linkage(X, method=linkage, metric=metric)
    return tree_from_linkage(Z)
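

# Usage sketch (assumes an Orange data table is available; not part of the
# original module). `data_clustering` clusters rows, `feature_clustering`
# clusters columns, and `dist_matrix_clustering` accepts any symmetric
# distance matrix:
#
#     >>> from Orange.data import Table
#     >>> iris = Table("iris")
#     >>> tree = data_clustering(iris)
#     >>> tree.value.range
#     (0, 150)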


Tree = namedtuple("Tree", ["value", "branches"])

ClusterData = namedtuple("Cluster", ["range", "height"])
SingletonData = namedtuple("Singleton", ["range", "height", "index"])


class _Ranged:

    @property
    def first(self):
        return self.range[0]

    @property
    def last(self):
        return self.range[-1]


class ClusterData(ClusterData, _Ranged):
    __slots__ = ()


class SingletonData(SingletonData, _Ranged):
    __slots__ = ()


class Tree(Tree):
    def __new__(cls, value, branches=()):
        if not isinstance(branches, tuple):
            raise TypeError()

        return super().__new__(cls, value, branches)

    @property
    def is_leaf(self):
        return not bool(self.branches)

    @property
    def left(self):
        return self.branches[0]

    @property
    def right(self):
        return self.branches[-1]
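

# Structure sketch (illustrative, not part of the original module): a
# cluster node carries a half-open `range` of leaf positions and a merge
# `height`; leaves additionally carry the `index` of the original item.
#
#     >>> a = Tree(SingletonData(range=(0, 1), height=0.0, index=0))
#     >>> b = Tree(SingletonData(range=(1, 2), height=0.0, index=1))
#     >>> root = Tree(ClusterData(range=(0, 2), height=1.5), (a, b))
#     >>> root.is_leaf, root.left is a, root.value.first, root.value.last
#     (False, True, 0, 2)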


def tree_from_linkage(linkage):
    """
    Return a Tree representation of a clustering encoded in a linkage matrix.

    .. seealso:: scipy.cluster.hierarchy.linkage

    """
    T = {}
    N, _ = linkage.shape
    N = N + 1
    # Leaf order as encountered in the linkage rows; row i of the linkage
    # matrix joins clusters `c1` and `c2` into a new cluster with id N + i.
    order = []
    for i, (c1, c2, d, _) in enumerate(linkage):
        if c1 < N:
            left = Tree(SingletonData(range=(len(order), len(order) + 1),
                                      height=0.0, index=int(c1)),
                        ())
            order.append(c1)
        else:
            left = T[c1]

        if c2 < N:
            right = Tree(SingletonData(range=(len(order), len(order) + 1),
                                       height=0.0, index=int(c2)),
                         ())
            order.append(c2)
        else:
            right = T[c2]

        t = Tree(ClusterData(range=(left.value.first, right.value.last),
                             height=d),
                 (left, right))
        T[N + i] = t

    root = T[N + N - 2]
    T = {}

    # Second pass: renumber the leaf ranges so they follow the left-to-right
    # order of the leaves in the final tree.
    leaf_idx = 0
    for node in postorder(root):
        if node.is_leaf:
            T[node] = Tree(
                node.value._replace(range=(leaf_idx, leaf_idx + 1)), ())
            leaf_idx += 1
        else:
            left, right = T[node.left].value, T[node.right].value
            assert left.first < right.first

            t = Tree(
                node.value._replace(range=(left.range[0], right.range[1])),
                tuple(T[ch] for ch in node.branches)
            )
            assert t.value.range[0] <= t.value.range[-1]
            assert left.first == t.value.first and right.last == t.value.last
            assert t.value.first < right.first
            assert t.value.last > left.last
            T[node] = t

    return T[root]
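

# Example (illustrative sketch): a linkage matrix over three items where
# items 0 and 1 merge at height 1.0 and item 2 is joined with that cluster
# at height 2.0. Because the second row lists item 2 first, it becomes the
# leftmost leaf; leaf ranges are renumbered left to right, while `index`
# keeps the original item ids:
#
#     >>> Z = numpy.array([[0.0, 1.0, 1.0, 2.0],
#     ...                  [2.0, 3.0, 2.0, 3.0]])
#     >>> root = tree_from_linkage(Z)
#     >>> float(root.value.height), root.value.range
#     (2.0, (0, 3))
#     >>> [leaf.value.index for leaf in leaves(root)]
#     [2, 0, 1]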


def postorder(tree, branches=attrgetter("branches")):
    stack = deque([tree])
    visited = set()

    while stack:
        current = stack.popleft()
        children = branches(current)
        if children:
            # yield the item on the way up
            if current in visited:
                yield current
            else:
                # stack = children + [current] + stack
                stack.extendleft([current])
                stack.extendleft(reversed(children))
                visited.add(current)

        else:
            yield current
            visited.add(current)


def preorder(tree, branches=attrgetter("branches")):
    stack = deque([tree])
    while stack:
        current = stack.popleft()
        yield current
        children = branches(current)
        if children:
            stack.extendleft(reversed(children))


def leaves(tree, branches=attrgetter("branches")):
    """
    Return an iterator over the leaf nodes in a tree structure.
    """
    return (node for node in postorder(tree, branches)
            if node.is_leaf)
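

# Traversal sketch (illustrative): with `a`, `b` and `root` built as in the
# `Tree` example above, `postorder` yields children before their parent,
# `preorder` yields parents first, and `leaves` filters postorder down to
# the leaf nodes:
#
#     >>> a = Tree(SingletonData(range=(0, 1), height=0.0, index=0))
#     >>> b = Tree(SingletonData(range=(1, 2), height=0.0, index=1))
#     >>> root = Tree(ClusterData(range=(0, 2), height=1.5), (a, b))
#     >>> [t is root for t in postorder(root)]
#     [False, False, True]
#     >>> [t is root for t in preorder(root)]
#     [True, False, False]
#     >>> [leaf.value.index for leaf in leaves(root)]
#     [0, 1]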


def prune(cluster, level=None, height=None, condition=None):
    """
    Prune the clustering instance ``cluster``.

    :param Tree cluster: Cluster root node to prune.
    :param int level: If not `None`, prune all clusters deeper than `level`.
    :param float height:
        If not `None`, prune all clusters with height lower than `height`.
    :param function condition:
        If not `None`, `condition` must be a `Tree -> bool` function
        evaluating to `True` if the cluster should be pruned.

    .. note::
        At least one of the `level`, `height` or `condition` arguments
        needs to be supplied.

    """
    if not any(arg is not None for arg in [level, height, condition]):
        raise ValueError("At least one pruning argument must be supplied")

    level_check = height_check = condition_check = lambda cl: False

    if level is not None:
        cluster_depth = cluster_depths(cluster)
        level_check = lambda cl: cluster_depth[cl] >= level

    if height is not None:
        height_check = lambda cl: cl.value.height <= height

    if condition is not None:
        condition_check = condition

    def check_all(cl):
        return level_check(cl) or height_check(cl) or condition_check(cl)

    T = {}

    for node in postorder(cluster):
        if check_all(node):
            if node.is_leaf:
                T[node] = node
            else:
                T[node] = Tree(node.value, ())
        else:
            T[node] = Tree(node.value,
                           tuple(T[ch] for ch in node.branches))
    return T[cluster]
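

# Example (illustrative sketch): pruning the three-leaf tree from the
# `tree_from_linkage` example at height 1.0 collapses the lower cluster
# into a childless node, while the root is kept intact:
#
#     >>> Z = numpy.array([[0.0, 1.0, 1.0, 2.0],
#     ...                  [2.0, 3.0, 2.0, 3.0]])
#     >>> root = tree_from_linkage(Z)
#     >>> pruned = prune(root, height=1.0)
#     >>> pruned.is_leaf, pruned.right.is_leaf, float(pruned.right.value.height)
#     (False, True, 1.0)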


def cluster_depths(cluster):
    """
    Return a dictionary mapping :class:`Tree` instances to their depth.

    :param Tree cluster: Root cluster
    :rtype: :class:`dict`

    """
    depths = {}
    depths[cluster] = 0
    for cluster in preorder(cluster):
        cl_depth = depths[cluster]
        depths.update(dict.fromkeys(cluster.branches, cl_depth + 1))
    return depths


def top_clusters(tree, k):
    """
    Return the `k` topmost clusters from the hierarchical clustering.

    :param Tree tree: Root cluster.
    :param int k: Number of top clusters.

    :rtype: list of :class:`Tree` instances
    """
    def item(node):
        return ((node.is_leaf, -node.value.height), node)

    heap = [item(tree)]

    while len(heap) < k:
        key, cl = heapq.heappop(heap)
        if cl.is_leaf:
            assert all(n.is_leaf for _, n in heap)
            heapq.heappush(heap, (key, cl))
            break

        left, right = cl.left, cl.right
        heapq.heappush(heap, item(left))
        heapq.heappush(heap, item(right))

    return [n for _, n in heap]
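

# Example (illustrative sketch): cutting the three-leaf tree from the
# `tree_from_linkage` example into two clusters splits the root's highest
# merge; the two returned subtrees cover leaf ranges (0, 1) and (1, 3):
#
#     >>> Z = numpy.array([[0.0, 1.0, 1.0, 2.0],
#     ...                  [2.0, 3.0, 2.0, 3.0]])
#     >>> root = tree_from_linkage(Z)
#     >>> sorted(cl.value.range for cl in top_clusters(root, 2))
#     [(0, 1), (1, 3)]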


def optimal_leaf_ordering(tree, distances, progress_callback=None):
    """
    Order the leaves in the clustering tree.

    (based on Ziv Bar-Joseph et al. Fast optimal leaf ordering for
    hierarchical clustering)

    :param Tree tree:
        Binary hierarchical clustering tree.
    :param numpy.ndarray distances:
        A (N, N) numpy.ndarray of distances that were used to compute
        the clustering.
    :param function progress_callback:
        Function used to report on progress.

    """
    distances = numpy.asarray(distances)
    M = numpy.zeros_like(distances)

    # rearrange distances by the order defined by the tree's leaves
    indices = numpy.array([leaf.value.index for leaf in leaves(tree)])
    distances = distances[indices[numpy.newaxis, :],
                          indices[:, numpy.newaxis]]
    distances = numpy.ascontiguousarray(distances)

    # This is the 'fast' early termination search described in the paper
    # (it is slower in the pure python implementation)
    def argmin_ordered_xpypZ(x, y, Z, sorter_x=None, sorter_y=None):
        C = numpy.min(Z)
        if sorter_x is None:
            sorter_x = range(len(x))
            ordered_x = x
        else:
            ordered_x = x[sorter_x]
        if sorter_y is None:
            sorter_y = range(len(y))
            ordered_y = y
        else:
            ordered_y = y[sorter_y]

        y0 = ordered_y[0]

        best_val = numpy.inf
        best_i, best_j = 0, 0

        y0pC = y0 + C
        for i, x in zip(sorter_x, ordered_x):
            if x + y0pC >= best_val:
                break
            xpC = x + C
            for j, y in zip(sorter_y, ordered_y):
                if xpC + y >= best_val:
                    break
                val = x + y + Z[i, j]
                if val < best_val:
                    best_val, best_i, best_j = val, i, j
        return best_i, best_j

    def argmin_xpypZ(x, y, Z):
        _, h = Z.shape
        A = Z + numpy.reshape(x, (-1, 1))
        A += numpy.reshape(y, (1, -1))
        i = numpy.argmin(A)
        return i // h, i % h

    def optimal_ordering(tree, M, ordering):
        if tree.is_leaf:
            M[tree.value.first, tree.value.first] = 0.0
        else:
            left, right = tree.left, tree.right
            if not left.is_leaf:
                V_ll = range(*left.left.value.range)
                V_lr = range(*left.right.value.range)
                u_iter = chain(((u, V_lr) for u in V_ll),
                               ((u, V_ll) for u in V_lr))
            else:
                V_lr = range(*left.value.range)
                u_iter = ((u, V_lr) for u in V_lr)

            u_iter = list(u_iter)
            assert [u for u, _ in u_iter] == list(range(*left.value.range))

            if not right.is_leaf:
                V_rl = range(*right.left.value.range)
                V_rr = range(*right.right.value.range)
                w_iter = chain(((w, V_rr) for w in V_rl),
                               ((w, V_rl) for w in V_rr))
            else:
                V_rr = range(*right.value.range)
                w_iter = ((w, V_rr) for w in V_rr)

            w_iter = list(w_iter)
            assert [w for w, _ in w_iter] == list(range(*right.value.range))

            for u, left_inner in u_iter:
                left_inner_slice = slice(left_inner.start, left_inner.stop)
                M_left_inner = M[u, left_inner_slice]
                # left_inner_sort = numpy.argsort(M_left_inner)
                for w, right_inner in w_iter:
                    right_inner_slice = slice(right_inner.start,
                                              right_inner.stop)
                    M_right_inner = M[w, right_inner_slice]
                    # right_inner_sort = numpy.argsort(M_right_inner)
                    # i, j = argmin_ordered_xpypZ(
                    #     M_left_inner, M_right_inner,
                    #     distances[left_inner_slice, right_inner_slice],
                    #     sorter_x=left_inner_sort, sorter_y=right_inner_sort,
                    # )
                    i, j = argmin_xpypZ(
                        M_left_inner, M_right_inner,
                        distances[left_inner_slice, right_inner_slice]
                    )
                    m, k = left_inner.start + i, right_inner.start + j
                    score = M[u, m] + M[k, w] + distances[m, k]
                    M[u, w] = M[w, u] = score
                    ordering[u, w] = (u, m, k, w)

        return M, ordering

    subtrees = list(postorder(tree))
    ordering = {}

    for i, subtree in enumerate(subtrees):
        M, ordering = optimal_ordering(subtree, M, ordering)

        if progress_callback:
            progress_callback(100.0 * i / len(subtrees))

    def min_uw(tree, u=None, w=None):
        if tree.is_leaf:
            return 0.0
        else:
            if u is None:
                U = slice(*tree.left.value.range)
            else:
                U = slice(u, u + 1)
            if w is None:
                W = slice(*tree.right.value.range)
            else:
                W = slice(w, w + 1)

            M_ = M[U, W]
            _, w = M_.shape
            i = numpy.argmin(M_.ravel())
            i, j = i // w, i % w
            return U.start + i, W.start + j

    def optimal_swap(root, M):
        opt_uw = {root: min_uw(root)}
        # run down the tree applying u, w constraints from parents.
        for tree in preorder(root):
            if tree.is_leaf:
                pass
            else:
                u, w = opt_uw[tree]
                assert u in range(*tree.value.range)
                assert w in range(*tree.value.range)
                if u < w:
                    u, m, k, w = ordering[u, w]
                    opt_uw[tree.left] = (u, m)
                    opt_uw[tree.right] = (k, w)
                else:
                    w, k, m, u = ordering[w, u]
                    opt_uw[tree.right] = (u, m)
                    opt_uw[tree.left] = (k, w)

        def is_swapped(tree):
            "Is `tree` swapped based on computed optimal ordering"
            if tree.is_leaf:
                return False
            else:
                u, w = opt_uw[tree]
                return u > w

        def swapped_branches(tree):
            "Return branches from `tree` in optimal order"
            if tree.is_leaf:
                return ()
            elif is_swapped(tree):
                return tree.branches
            else:
                return tuple(reversed(tree.branches))

        # Create a new tree structure with optimally swapped branches.
        T = {}
        counter = count(0)
        for tree in postorder(root, branches=swapped_branches):
            if tree.is_leaf:
                # we need to 're-enumerate' the leaves
                i = next(counter)
                T[tree] = Tree(tree.value._replace(range=(i, i + 1)), ())
            else:
                left, right = T[tree.left], T[tree.right]

                if left.value.first > right.value.first:
                    right, left = left, right

                assert left.value.first < right.value.last
                assert left.value.last == right.value.first

                T[tree] = Tree(tree.value._replace(range=(left.value.first,
                                                          right.value.last)),
                               (left, right))
        return T[root]

    return optimal_swap(tree, M)
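

# Usage sketch (illustrative): reorder the leaves of a clustering so that
# adjacent leaves are as similar as possible, given the same square
# distance matrix `D` that produced the tree:
#
#     >>> D = squareform(numpy.array([1.0, 6.0, 5.0, 4.0, 3.0, 2.0]))
#     >>> tree = dist_matrix_clustering(D)
#     >>> ordered = optimal_leaf_ordering(tree, D)
#     >>> len(list(leaves(ordered))) == len(list(leaves(tree)))
#     True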


class HierarchicalClustering:
    def __init__(self, n_clusters=2, linkage=AVERAGE):
        self.n_clusters = n_clusters
        self.linkage = linkage

    def fit(self, X):
        self.tree = dist_matrix_clustering(X, linkage=self.linkage)
        cut = top_clusters(self.tree, self.n_clusters)
        labels = numpy.zeros(self.tree.value.last)

        for i, cl in enumerate(cut):
            indices = [leaf.value.index for leaf in leaves(cl)]
            labels[indices] = i

        self.labels = labels

    def fit_predict(self, X, y=None):
        self.fit(X)
        return self.labels
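

if __name__ == "__main__":
    # Small self-contained demo (illustrative, not part of the original
    # module): cluster four 2-D points from their pairwise Euclidean
    # distance matrix computed with plain numpy. The two tight pairs
    # should end up in separate clusters.
    points = numpy.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
    diff = points[:, numpy.newaxis, :] - points[numpy.newaxis, :, :]
    dist = numpy.sqrt((diff ** 2).sum(axis=-1))
    print(HierarchicalClustering(n_clusters=2).fit_predict(dist))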