from collections import namedtuple, deque
from operator import attrgetter
from itertools import chain, count
import heapq

import numpy

import scipy.cluster.hierarchy
from Orange.distance import Euclidean, PearsonR

__all__ = ['HierarchicalClustering']

SINGLE = "single"
AVERAGE = "average"
COMPLETE = "complete"
WEIGHTED = "weighted"
WARD = "ward"

def condensedform(X, mode="upper"):
    """Return the upper (or lower) triangle of a square matrix `X` as a flat vector."""
    X = numpy.asarray(X)
    assert len(X.shape) == 2
    assert X.shape[0] == X.shape[1]

    N = X.shape[0]

    if mode == "upper":
        i, j = numpy.triu_indices(N, k=1)
    elif mode == "lower":
        i, j = numpy.tril_indices(N, k=-1)
    else:
        raise ValueError("invalid mode")
    return X[i, j]

def squareform(X, mode="upper"):
    """Reconstruct a symmetric square matrix from its condensed form `X`."""
    X = numpy.asarray(X)
    k = X.shape[0]
    N = int(numpy.ceil(numpy.sqrt(k * 2)))
    assert N * (N - 1) // 2 == k
    matrix = numpy.zeros((N, N), dtype=X.dtype)
    if mode == "upper":
        i, j = numpy.triu_indices(N, k=1)
        matrix[i, j] = X
        m, n = numpy.tril_indices(N, k=-1)
        matrix[m, n] = matrix.T[m, n]
    elif mode == "lower":
        i, j = numpy.tril_indices(N, k=-1)
        matrix[i, j] = X
        m, n = numpy.triu_indices(N, k=1)
        matrix[m, n] = matrix.T[m, n]
    return matrix

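# A minimal sketch of how the two helpers relate (illustration only, not part
# of the original module): `condensedform` flattens the strict upper (or
# lower) triangle of a symmetric matrix row by row, and `squareform` rebuilds
# the full symmetric matrix from that vector.
#
#     >>> D = numpy.array([[0., 1., 2.],
#     ...                  [1., 0., 3.],
#     ...                  [2., 3., 0.]])
#     >>> condensedform(D).tolist()
#     [1.0, 2.0, 3.0]
#     >>> bool(numpy.all(squareform(condensedform(D)) == D))
#     True
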
def data_clustering(data, distance=Euclidean,
                    linkage=AVERAGE):
    """
    Return the hierarchical clustering of the data set's rows.

    :param Orange.data.Table data: Data set to cluster.
    :param Orange.distance.Distance distance: A distance measure.
    :param str linkage:
        Linkage method (one of `SINGLE`, `AVERAGE`, `COMPLETE`, `WEIGHTED`
        or `WARD`).
    """
    matrix = distance(data)
    return dist_matrix_clustering(matrix, linkage=linkage)

def feature_clustering(data, distance=PearsonR,
                       linkage=AVERAGE):
    """
    Return the hierarchical clustering of the data set's columns.

    :param Orange.data.Table data: Data set to cluster.
    :param Orange.distance.Distance distance: A distance measure.
    :param str linkage:
        Linkage method (one of `SINGLE`, `AVERAGE`, `COMPLETE`, `WEIGHTED`
        or `WARD`).
    """
    matrix = distance(data, axis=0)
    return dist_matrix_clustering(matrix, linkage=linkage)

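# A hedged usage sketch, assuming the "iris" dataset bundled with Orange is
# available; rows are clustered with the default Euclidean distance, columns
# with Pearson correlation.
#
#     >>> from Orange.data import Table
#     >>> data = Table("iris")
#     >>> row_tree = data_clustering(data)       # Euclidean over rows
#     >>> col_tree = feature_clustering(data)    # PearsonR over columns
#     >>> row_tree.value.range                   # root spans all 150 rows
#     (0, 150)
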
def dist_matrix_linkage(matrix, linkage=AVERAGE):
    """
    Return linkage using a precomputed distance matrix.

    :param Orange.misc.DistMatrix matrix: Precomputed distance matrix.
    :param str linkage:
        Linkage method (one of `SINGLE`, `AVERAGE`, `COMPLETE`, `WEIGHTED`
        or `WARD`).
    """
    # Extract compressed upper triangular distance matrix.
    distances = condensedform(matrix)
    return scipy.cluster.hierarchy.linkage(distances, method=linkage)

def dist_matrix_clustering(matrix, linkage=AVERAGE):
    """
    Return the hierarchical clustering using a precomputed distance matrix.

    :param Orange.misc.DistMatrix matrix: Precomputed distance matrix.
    :param str linkage:
        Linkage method (one of `SINGLE`, `AVERAGE`, `COMPLETE`, `WEIGHTED`
        or `WARD`).
    """
    # Extract compressed upper triangular distance matrix.
    distances = condensedform(matrix)
    Z = scipy.cluster.hierarchy.linkage(distances, method=linkage)
    return tree_from_linkage(Z)

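# A minimal sketch with a precomputed distance matrix; a plain symmetric
# numpy array works here, since `condensedform` only needs a square array.
#
#     >>> D = squareform(numpy.array([1., 2., 3.]))
#     >>> tree = dist_matrix_clustering(D, linkage=COMPLETE)
#     >>> tree.value.range                       # root covers all three leaves
#     (0, 3)
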
def sample_clustering(X, linkage=AVERAGE, metric="euclidean"):
    assert len(X.shape) == 2
    Z = scipy.cluster.hierarchy.linkage(X, method=linkage, metric=metric)
    return tree_from_linkage(Z)


Tree = namedtuple("Tree", ["value", "branches"])

ClusterData = namedtuple("Cluster", ["range", "height"])
SingletonData = namedtuple("Singleton", ["range", "height", "index"])


class _Ranged:

    @property
    def first(self):
        return self.range[0]

    @property
    def last(self):
        return self.range[-1]


class ClusterData(ClusterData, _Ranged):
    __slots__ = ()


class SingletonData(SingletonData, _Ranged):
    __slots__ = ()


class Tree(Tree):
    def __new__(cls, value, branches=()):
        if not isinstance(branches, tuple):
            raise TypeError()

        return super().__new__(cls, value, branches)

    @property
    def is_leaf(self):
        return not bool(self.branches)

    @property
    def left(self):
        return self.branches[0]

    @property
    def right(self):
        return self.branches[-1]

def tree_from_linkage(linkage):
    """
    Return a Tree representation of a clustering encoded in a linkage matrix.

    .. seealso:: scipy.cluster.hierarchy.linkage

    """
    T = {}
    N, _ = linkage.shape
    N = N + 1
    order = []
    # First pass: build the tree bottom up following the linkage rows;
    # cluster ids >= N refer to clusters created by earlier rows.
    for i, (c1, c2, d, _) in enumerate(linkage):
        if c1 < N:
            left = Tree(SingletonData(range=(len(order), len(order) + 1),
                                      height=0.0, index=int(c1)),
                        ())
            order.append(c1)
        else:
            left = T[c1]

        if c2 < N:
            right = Tree(SingletonData(range=(len(order), len(order) + 1),
                                       height=0.0, index=int(c2)),
                         ())
            order.append(c2)
        else:
            right = T[c2]

        t = Tree(ClusterData(range=(left.value.first, right.value.last),
                             height=d),
                 (left, right))
        T[N + i] = t

    root = T[N + N - 2]
    T = {}

    # Second pass: re-enumerate the leaves from left to right so that each
    # node's `range` is a contiguous (start, stop) interval of leaf positions.
    leaf_idx = 0
    for node in postorder(root):
        if node.is_leaf:
            T[node] = Tree(
                node.value._replace(range=(leaf_idx, leaf_idx + 1)), ())
            leaf_idx += 1
        else:
            left, right = T[node.left].value, T[node.right].value
            assert left.first < right.first

            t = Tree(
                node.value._replace(range=(left.range[0], right.range[1])),
                tuple(T[ch] for ch in node.branches)
            )
            assert t.value.range[0] <= t.value.range[-1]
            assert left.first == t.value.first and right.last == t.value.last
            assert t.value.first < right.first
            assert t.value.last > left.last
            T[node] = t

    return T[root]
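# Sketch of the expected input, assuming scipy's linkage format: each of the
# N - 1 rows of `linkage` is (cluster_i, cluster_j, distance, size), where
# ids below N denote singletons and ids >= N clusters formed by earlier rows.
#
#     >>> Z = scipy.cluster.hierarchy.linkage([1., 2., 3.], method="single")
#     >>> root = tree_from_linkage(Z)
#     >>> root.value.range
#     (0, 3)
#     >>> float(root.value.height)    # 0 and 1 merge at 1.0, then 2 joins at 2.0
#     2.0
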
def postorder(tree, branches=attrgetter("branches")):
    stack = deque([tree])
    visited = set()

    while stack:
        current = stack.popleft()
        children = branches(current)
        if children:
            # yield the item on the way up
            if current in visited:
                yield current
            else:
                # stack = children + [current] + stack
                stack.extendleft([current])
                stack.extendleft(reversed(children))
                visited.add(current)
        else:
            yield current
            visited.add(current)

def preorder(tree, branches=attrgetter("branches")):
    stack = deque([tree])
    while stack:
        current = stack.popleft()
        yield current
        children = branches(current)
        if children:
            stack.extendleft(reversed(children))

def leaves(tree, branches=attrgetter("branches")):
    """
    Return an iterator over the leaf nodes in a tree structure.
    """
    return (node for node in postorder(tree, branches)
            if node.is_leaf)

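# A small hand-built tree (illustration only) showing the traversal helpers:
# `postorder` yields children before their parent, `preorder` yields the
# parent first, and `leaves` keeps only the leaf nodes.
#
#     >>> a = Tree(SingletonData(range=(0, 1), height=0.0, index=0))
#     >>> b = Tree(SingletonData(range=(1, 2), height=0.0, index=1))
#     >>> root = Tree(ClusterData(range=(0, 2), height=1.0), (a, b))
#     >>> [n.value.height for n in postorder(root)]
#     [0.0, 0.0, 1.0]
#     >>> [n.value.index for n in leaves(root)]
#     [0, 1]
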
def prune(cluster, level=None, height=None, condition=None):
    """
    Prune the clustering instance ``cluster``.

    :param Tree cluster: Cluster root node to prune.
    :param int level: If not `None`, prune all clusters deeper than `level`.
    :param float height:
        If not `None`, prune all clusters with height lower than `height`.
    :param function condition:
        If not `None`, `condition` must be a `Tree -> bool` function
        evaluating to `True` if the cluster should be pruned.

    .. note::
        At least one of the `level`, `height` or `condition` arguments
        needs to be supplied.

    """
    if not any(arg is not None for arg in [level, height, condition]):
        raise ValueError("At least one pruning argument must be supplied")

    level_check = height_check = condition_check = lambda cl: False

    if level is not None:
        cluster_depth = cluster_depths(cluster)
        level_check = lambda cl: cluster_depth[cl] >= level

    if height is not None:
        height_check = lambda cl: cl.value.height <= height

    if condition is not None:
        condition_check = condition

    def check_all(cl):
        return level_check(cl) or height_check(cl) or condition_check(cl)

    T = {}

    for node in postorder(cluster):
        if check_all(node):
            if node.is_leaf:
                T[node] = node
            else:
                # Collapse the pruned cluster into a leaf of the new tree.
                T[node] = Tree(node.value, ())
        else:
            T[node] = Tree(node.value,
                           tuple(T[ch] for ch in node.branches))
    return T[cluster]
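# A hedged example of pruning: clusters merged below the height threshold
# collapse into leaves of the pruned tree, so a three-point clustering whose
# first merge happens at height 1.0 is left with two leaves.
#
#     >>> tree = dist_matrix_clustering(squareform(numpy.array([1., 2., 4.])))
#     >>> pruned = prune(tree, height=1.5)
#     >>> sum(1 for _ in leaves(pruned))
#     2
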
def cluster_depths(cluster):
    """
    Return a dictionary mapping :class:`Tree` instances to their depth.

    :param Tree cluster: Root cluster
    :rtype: :class:`dict`

    """
    depths = {}
    depths[cluster] = 0
    for cluster in preorder(cluster):
        cl_depth = depths[cluster]
        depths.update(dict.fromkeys(cluster.branches, cl_depth + 1))
    return depths

def top_clusters(tree, k):
    """
    Return `k` topmost clusters from hierarchical clustering.

    :param Tree tree: Root cluster.
    :param int k: Number of top clusters.

    :rtype: list of :class:`Tree` instances
    """
    def item(node):
        return ((node.is_leaf, -node.value.height), node)

    heap = [item(tree)]

    while len(heap) < k:
        key, cl = heapq.heappop(heap)
        if cl.is_leaf:
            # only leaves remain; the tree cannot be split any further
            assert all(n.is_leaf for _, n in heap)
            heapq.heappush(heap, (key, cl))
            break

        left, right = cl.left, cl.right
        heapq.heappush(heap, item(left))
        heapq.heappush(heap, item(right))

    return [n for _, n in heap]
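# Continuing the sketch above: cutting the same three-point clustering into
# its two topmost clusters yields the pair merged first plus the remaining
# singleton.
#
#     >>> tree = dist_matrix_clustering(squareform(numpy.array([1., 2., 4.])))
#     >>> sorted(cl.value.range for cl in top_clusters(tree, 2))
#     [(0, 2), (2, 3)]
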
def optimal_leaf_ordering(tree, distances, progress_callback=None):
    """
    Order the leaves in the clustering tree.

    (based on Ziv Bar-Joseph et al., "Fast optimal leaf ordering for
    hierarchical clustering")

    :param Tree tree:
        Binary hierarchical clustering tree.
    :param numpy.ndarray distances:
        An (N, N) numpy.ndarray of distances that were used to compute
        the clustering.
    :param function progress_callback:
        Function used to report on progress.

    """
    distances = numpy.asarray(distances)
    # M[u, w] holds the cost of the best ordering of a subtree whose
    # leftmost leaf ends up at position u and rightmost leaf at position w.
    M = numpy.zeros_like(distances)

    # rearrange distances by order defined by tree's leaves
    indices = numpy.array([leaf.value.index for leaf in leaves(tree)])
    distances = distances[indices[numpy.newaxis, :],
                          indices[:, numpy.newaxis]]
    distances = numpy.ascontiguousarray(distances)

    # This is the 'fast' early termination search described in the paper
    # (it is slower in the pure python implementation)
    def argmin_ordered_xpypZ(x, y, Z, sorter_x=None, sorter_y=None):
        C = numpy.min(Z)
        if sorter_x is None:
            sorter_x = range(len(x))
            ordered_x = x
        else:
            ordered_x = x[sorter_x]
        if sorter_y is None:
            sorter_y = range(len(y))
            ordered_y = y
        else:
            ordered_y = y[sorter_y]

        y0 = ordered_y[0]

        best_val = numpy.inf
        best_i, best_j = 0, 0

        y0pC = y0 + C
        for i, x in zip(sorter_x, ordered_x):
            if x + y0pC >= best_val:
                break
            xpC = x + C
            for j, y in zip(sorter_y, ordered_y):
                if xpC + y >= best_val:
                    break
                val = x + y + Z[i, j]
                if val < best_val:
                    best_val, best_i, best_j = val, i, j
        return best_i, best_j

    def argmin_xpypZ(x, y, Z):
        _, h = Z.shape
        A = Z + numpy.reshape(x, (-1, 1))
        A += numpy.reshape(y, (1, -1))
        i = numpy.argmin(A)
        return i // h, i % h
    def optimal_ordering(tree, M, ordering):
        if tree.is_leaf:
            M[tree.value.first, tree.value.first] = 0.0
        else:
            left, right = tree.left, tree.right
            if not left.is_leaf:
                V_ll = range(*left.left.value.range)
                V_lr = range(*left.right.value.range)
                u_iter = chain(((u, V_lr) for u in V_ll),
                               ((u, V_ll) for u in V_lr))
            else:
                V_lr = range(*left.value.range)
                u_iter = ((u, V_lr) for u in V_lr)

            u_iter = list(u_iter)
            assert [u for u, _ in u_iter] == list(range(*left.value.range))

            if not right.is_leaf:
                V_rl = range(*right.left.value.range)
                V_rr = range(*right.right.value.range)
                w_iter = chain(((w, V_rr) for w in V_rl),
                               ((w, V_rl) for w in V_rr))
            else:
                V_rr = range(*right.value.range)
                w_iter = ((w, V_rr) for w in V_rr)

            w_iter = list(w_iter)
            assert [w for w, _ in w_iter] == list(range(*right.value.range))

            for u, left_inner in u_iter:
                left_inner_slice = slice(left_inner.start, left_inner.stop)
                M_left_inner = M[u, left_inner_slice]
                # left_inner_sort = numpy.argsort(M_left_inner)
                for w, right_inner in w_iter:
                    right_inner_slice = slice(right_inner.start,
                                              right_inner.stop)
                    M_right_inner = M[w, right_inner_slice]
                    # right_inner_sort = numpy.argsort(M_right_inner)
                    # i, j = argmin_ordered_xpypZ(
                    #     M_left_inner, M_right_inner,
                    #     distances[left_inner_slice, right_inner_slice],
                    #     sorter_x=left_inner_sort, sorter_y=right_inner_sort,
                    # )
                    i, j = argmin_xpypZ(
                        M_left_inner, M_right_inner,
                        distances[left_inner_slice, right_inner_slice]
                    )
                    m, k = left_inner.start + i, right_inner.start + j
                    score = M[u, m] + M[k, w] + distances[m, k]
                    M[u, w] = M[w, u] = score
                    ordering[u, w] = (u, m, k, w)

        return M, ordering

    subtrees = list(postorder(tree))
    ordering = {}

    for i, subtree in enumerate(subtrees):
        M, ordering = optimal_ordering(subtree, M, ordering)

        if progress_callback:
            progress_callback(100.0 * i / len(subtrees))

    def min_uw(tree, u=None, w=None):
        if tree.is_leaf:
            return 0.0
        else:
            if u is None:
                U = slice(*tree.left.value.range)
            else:
                U = slice(u, u + 1)
            if w is None:
                W = slice(*tree.right.value.range)
            else:
                W = slice(w, w + 1)

            M_ = M[U, W]
            _, w = M_.shape
            i = numpy.argmin(M_.ravel())
            i, j = i // w, i % w
            return U.start + i, W.start + j

    def optimal_swap(root, M):
        opt_uw = {root: min_uw(root)}
        # run down the tree applying u, w constraints from parents.
        for tree in preorder(root):
            if tree.is_leaf:
                pass
            else:
                u, w = opt_uw[tree]
                assert u in range(*tree.value.range)
                assert w in range(*tree.value.range)
                if u < w:
                    u, m, k, w = ordering[u, w]
                    opt_uw[tree.left] = (u, m)
                    opt_uw[tree.right] = (k, w)
                else:
                    w, k, m, u = ordering[w, u]
                    opt_uw[tree.right] = (u, m)
                    opt_uw[tree.left] = (k, w)

        def is_swapped(tree):
            "Is `tree` swapped based on computed optimal ordering"
            if tree.is_leaf:
                return False
            else:
                u, w = opt_uw[tree]
                return u > w

        def swaped_branches(tree):
            "Return branches from `tree` in optimal order"
            if tree.is_leaf:
                return ()
            elif is_swapped(tree):
                return tree.branches
            else:
                return tuple(reversed(tree.branches))

        # Create a new tree structure with optimally swapped branches.
        T = {}
        counter = count(0)
        for tree in postorder(root, branches=swaped_branches):
            if tree.is_leaf:
                # we need to 're-enumerate' the leaves
                i = next(counter)
                T[tree] = Tree(tree.value._replace(range=(i, i + 1)), ())
            else:
                left, right = T[tree.left], T[tree.right]

                if left.value.first > right.value.first:
                    right, left = left, right

                assert left.value.first < right.value.last
                assert left.value.last == right.value.first

                T[tree] = Tree(tree.value._replace(range=(left.value.first,
                                                          right.value.last)),
                               (left, right))
        return T[root]

    return optimal_swap(tree, M)
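# A hedged usage sketch: reorder the leaves of a clustering using the same
# distance matrix it was computed from (any full (N, N) array works); the
# set of leaves is unchanged, only their left-to-right order may differ.
#
#     >>> D = squareform(numpy.array([1., 2., 4.]))
#     >>> tree = dist_matrix_clustering(D)
#     >>> ordered = optimal_leaf_ordering(tree, D)
#     >>> sorted(leaf.value.index for leaf in leaves(ordered))
#     [0, 1, 2]
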
class HierarchicalClustering:
    def __init__(self, n_clusters=2, linkage=AVERAGE):
        self.n_clusters = n_clusters
        self.linkage = linkage

    def fit(self, X):
        self.tree = dist_matrix_clustering(X, linkage=self.linkage)
        cut = top_clusters(self.tree, self.n_clusters)
        labels = numpy.zeros(self.tree.value.last)

        for i, cl in enumerate(cut):
            indices = [leaf.value.index for leaf in leaves(cl)]
            labels[indices] = i

        self.labels = labels

    def fit_predict(self, X, y=None):
        self.fit(X)
        return self.labels

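if __name__ == "__main__":
    # A small self-contained demo (added for illustration, not part of the
    # public API): cluster six random 2-D points through a precomputed
    # Euclidean distance matrix.
    _rng = numpy.random.RandomState(0)
    _points = _rng.rand(6, 2)
    _diff = _points[:, numpy.newaxis, :] - _points[numpy.newaxis, :, :]
    _dist = numpy.sqrt((_diff ** 2).sum(axis=-1))

    _model = HierarchicalClustering(n_clusters=3, linkage=AVERAGE)
    print("cluster labels:", _model.fit_predict(_dist))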