GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to re-new the access token on this website.

Issues (4082)

Orange/clustering/hierarchical.py (10 issues)

1
from collections import namedtuple, deque
2
from operator import attrgetter
3
from itertools import chain, count
4
import heapq
5
import numpy
0 ignored issues
show
The import numpy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
6
7
import scipy.cluster.hierarchy
0 ignored issues
show
The import scipy.cluster.hierarchy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
8
from Orange.distance import Euclidean, PearsonR
9
10
__all__ = ['HierarchicalClustering']
11
12
SINGLE = "single"
13
AVERAGE = "average"
14
COMPLETE = "complete"
15
WEIGHTED = "weighted"
16
WARD = "ward"
17
18
19
def condensedform(X, mode="upper"):
20
    X = numpy.asarray(X)
21
    assert len(X.shape) == 2
22
    assert X.shape[0] == X.shape[1]
23
24
    N = X.shape[0]
25
26
    if mode == "upper":
27
        i, j = numpy.triu_indices(N, k=1)
28
    elif mode == "lower":
29
        i, j = numpy.tril_indices(N, k=-1)
30
    else:
31
        raise ValueError("invalid mode")
32
    return X[i, j]
33
34
35
def squareform(X, mode="upper"):
36
    X = numpy.asarray(X)
37
    k = X.shape[0]
38
    N = int(numpy.ceil(numpy.sqrt(k * 2)))
39
    assert N * (N - 1) // 2 == k
40
    matrix = numpy.zeros((N, N), dtype=X.dtype)
41
    if mode == "upper":
42
        i, j = numpy.triu_indices(N, k=1)
43
        matrix[i, j] = X
44
        m, n = numpy.tril_indices(N, k=-1)
45
        matrix[m, n] = matrix.T[m, n]
46
    elif mode == "lower":
47
        i, j = numpy.tril_indices(N, k=-1)
48
        matrix[i, j] = X
49
        m, n = numpy.triu_indices(N, k=1)
50
        matrix[m, n] = matrix.T[m, n]
51
    return matrix
52
53
54
def data_clustering(data, distance=Euclidean,
55
                    linkage=AVERAGE):
56
    """
57
    Return the hierarchical clustering of the data set's rows.
58
59
    :param Orange.data.Table data: Data set to cluster.
60
    :param Orange.distance.Distance distance: A distance measure.
61
    :param str linkage:
62
    """
63
    matrix = distance(data)
64
    return dist_matrix_clustering(matrix, linkage=linkage)
65
66
67
def feature_clustering(data, distance=PearsonR,
68
                       linkage=AVERAGE):
69
    """
70
    Return the hierarchical clustering of the data set's columns.
71
72
    :param Orange.data.Table data: Data set to cluster.
73
    :param Orange.distance.Distance distance: A distance measure.
74
    :param str linkage:
75
    """
76
    matrix = distance(data, axis=0)
77
    return dist_matrix_clustering(matrix, linkage=linkage)
78
79
80
81
def dist_matrix_linkage(matrix, linkage=AVERAGE):
82
    """
83
    Return linkage using a precomputed distance matrix.
84
85
    :param Orange.misc.DistMatrix matrix:
86
    :param str linkage:
87
    """
88
    # Extract compressed upper triangular distance matrix.
89
    distances = condensedform(matrix)
90
    return scipy.cluster.hierarchy.linkage(distances, method=linkage)
91
92
93
def dist_matrix_clustering(matrix, linkage=AVERAGE):
94
    """
95
    Return the hierarchical clustering using a precomputed distance matrix.
96
97
    :param Orange.misc.DistMatrix matrix:
98
    :param str linkage:
99
    """
100
    # Extract compressed upper triangular distance matrix.
101
    distances = condensedform(matrix)
102
    Z = scipy.cluster.hierarchy.linkage(distances, method=linkage)
103
    return tree_from_linkage(Z)
104
105
106
def sample_clustering(X, linkage=AVERAGE, metric="euclidean"):
107
    assert len(X.shape) == 2
108
    Z = scipy.cluster.hierarchy.linkage(X, method=linkage, metric=metric)
109
    return tree_from_linkage(Z)
110
111
112
Tree = namedtuple("Tree", ["value", "branches"])
113
114
ClusterData = namedtuple("Cluster", ["range", "height"])
115
SingletonData = namedtuple("Singleton", ["range", "height", "index"])
116
117
118
class _Ranged:
119
120
    @property
121
    def first(self):
122
        return self.range[0]
123
124
    @property
125
    def last(self):
126
        return self.range[-1]
127
128
129
class ClusterData(ClusterData, _Ranged):
0 ignored issues
show
class already defined line 114
Loading history...
130
    __slots__ = ()
131
132
133
class SingletonData(SingletonData, _Ranged):
0 ignored issues
show
class already defined line 115
Loading history...
134
    __slots__ = ()
135
136
137
class Tree(Tree):
0 ignored issues
show
class already defined line 112
Loading history...
138
    def __new__(cls, value, branches=()):
139
        if not isinstance(branches, tuple):
140
            raise TypeError()
141
142
        return super().__new__(cls, value, branches)
143
144
    @property
145
    def is_leaf(self):
146
        return not bool(self.branches)
147
148
    @property
149
    def left(self):
150
        return self.branches[0]
151
152
    @property
153
    def right(self):
154
        return self.branches[-1]
155
156
157
def tree_from_linkage(linkage):
158
    """
159
    Return a Tree representation of a clustering encoded in a linkage matrix.
160
161
    .. seealso:: scipy.cluster.hierarchy.linkage
162
163
    """
164
    T = {}
165
    N, _ = linkage.shape
166
    N = N + 1
167
    order = []
168
    for i, (c1, c2, d, _) in enumerate(linkage):
169
        if c1 < N:
170
            left = Tree(SingletonData(range=(len(order), len(order) + 1),
171
                                      height=0.0, index=int(c1)),
172
                        ())
173
            order.append(c1)
174
        else:
175
            left = T[c1]
176
177
        if c2 < N:
178
            right = Tree(SingletonData(range=(len(order), len(order) + 1),
179
                                       height=0.0, index=int(c2)),
180
                         ())
181
            order.append(c2)
182
        else:
183
            right = T[c2]
184
185
        t = Tree(ClusterData(range=(left.value.first, right.value.last),
186
                             height=d),
187
                 (left, right))
188
        T[N + i] = t
189
190
    root = T[N + N - 2]
191
    T = {}
192
193
    leaf_idx = 0
194
    for node in postorder(root):
195
        if node.is_leaf:
196
            T[node] = Tree(
197
                node.value._replace(range=(leaf_idx, leaf_idx + 1)), ())
198
            leaf_idx += 1
199
        else:
200
            left, right = T[node.left].value, T[node.right].value
201
            assert left.first < right.first
202
203
            t = Tree(
204
                node.value._replace(range=(left.range[0], right.range[1])),
205
                tuple(T[ch] for ch in node.branches)
206
            )
207
            assert t.value.range[0] <= t.value.range[-1]
208
            assert left.first == t.value.first and right.last == t.value.last
209
            assert t.value.first < right.first
210
            assert t.value.last > left.last
211
            T[node] = t
212
213
    return T[root]
214
215
216
def postorder(tree, branches=attrgetter("branches")):
217
    stack = deque([tree])
218
    visited = set()
219
220
    while stack:
221
        current = stack.popleft()
222
        children = branches(current)
223
        if children:
224
            # yield the item on the way up
225
            if current in visited:
226
                yield current
227
            else:
228
                # stack = children + [current] + stack
229
                stack.extendleft([current])
230
                stack.extendleft(reversed(children))
231
                visited.add(current)
232
233
        else:
234
            yield current
235
            visited.add(current)
236
237
238
def preorder(tree, branches=attrgetter("branches")):
239
    stack = deque([tree])
240
    while stack:
241
        current = stack.popleft()
242
        yield current
243
        children = branches(current)
244
        if children:
245
            stack.extendleft(reversed(children))
246
247
248
def leaves(tree, branches=attrgetter("branches")):
249
    """
250
    Return an iterator over the leaf nodes in a tree structure.
251
    """
252
    return (node for node in postorder(tree, branches)
253
            if node.is_leaf)
254
255
256
def prune(cluster, level=None, height=None, condition=None):
257
    """
258
    Prune the clustering instance ``cluster``.
259
260
    :param Tree cluster: Cluster root node to prune.
261
    :param int level: If not `None` prune all clusters deeper then `level`.
262
    :param float height:
263
        If not `None` prune all clusters with height lower then `height`.
264
    :param function condition:
265
        If not `None condition must be a `Tree -> bool` function
266
        evaluating to `True` if the cluster should be pruned.
267
268
    .. note::
269
        At least one `level`, `height` or `condition` argument needs to
270
        be supplied.
271
272
    """
273
    if not any(arg is not None for arg in [level, height, condition]):
274
        raise ValueError("At least one pruning argument must be supplied")
275
276
    level_check = height_check = condition_check = lambda cl: False
277
278
    if level is not None:
279
        cluster_depth = cluster_depths(cluster)
280
        level_check = lambda cl: cluster_depth[cl] >= level
281
282
    if height is not None:
283
        height_check = lambda cl: cl.value.height <= height
284
285
    if condition is not None:
286
        condition_check = condition
287
288
    def check_all(cl):
289
        return level_check(cl) or height_check(cl) or condition_check(cl)
290
291
    T = {}
292
293
    for node in postorder(cluster):
294
        if check_all(node):
295
            if node.is_leaf:
296
                T[node] = node
297
            else:
298
                T[node] = Tree(node.value, ())
299
        else:
300
            T[node] = Tree(node.value,
301
                           tuple(T[ch] for ch in node.branches))
302
    return T[cluster]
303
304
305
def cluster_depths(cluster):
306
    """
307
    Return a dictionary mapping :class:`Tree` instances to their depth.
308
309
    :param Tree cluster: Root cluster
310
    :rtype: class:`dict`
311
312
    """
313
    depths = {}
314
    depths[cluster] = 0
315
    for cluster in preorder(cluster):
316
        cl_depth = depths[cluster]
317
        depths.update(dict.fromkeys(cluster.branches, cl_depth + 1))
318
    return depths
319
320
321
def top_clusters(tree, k):
322
    """
323
    Return `k` topmost clusters from hierarchical clustering.
324
325
    :param Tree root: Root cluster.
326
    :param int k: Number of top clusters.
327
328
    :rtype: list of :class:`Tree` instances
329
    """
330
    def item(node):
331
        return ((node.is_leaf, -node.value.height), node)
332
333
    heap = [item(tree)]
334
335
    while len(heap) < k:
336
        key, cl = heapq.heappop(heap)
337
        if cl.is_leaf:
338
            assert all(n.is_leaf for _, n in heap)
339
            heapq.heappush(heap, (key, cl))
340
            break
341
342
        left, right = cl.left, cl.right
343
        heapq.heappush(heap, item(left))
344
        heapq.heappush(heap, item(right))
345
346
    return [n for _, n in heap]
347
348
349
def optimal_leaf_ordering(tree, distances, progress_callback=None):
350
    """
351
    Order the leaves in the clustering tree.
352
353
    (based on Ziv Bar-Joseph et al. Fast optimal leaf ordering for
354
    hierarchical clustering)
355
356
    :param Tree tree:
357
        Binary hierarchical clustering tree.
358
    :param numpy.ndarray distances:
359
        A (N, N) numpy.ndarray of distances that were used to compute
360
        the clustering.
361
    :param function progress_callback:
362
        Function used to report on progress.
363
364
    """
365
    distances = numpy.asarray(distances)
366
    M = numpy.zeros_like(distances)
367
368
    # rearrange distances by order defined by tree's leaves
369
    indices = numpy.array([leaf.value.index for leaf in leaves(tree)])
370
    distances = distances[indices[numpy.newaxis, :],
371
                          indices[:, numpy.newaxis]]
372
    distances = numpy.ascontiguousarray(distances)
373
374
    # This is the 'fast' early termination search described in the paper
375
    # (it is slower in the pure python implementation)
376
    def argmin_ordered_xpypZ(x, y, Z, sorter_x=None, sorter_y=None):
0 ignored issues
show
The variable argmin_ordered_xpypZ seems to be unused.
Loading history...
377
        C = numpy.min(Z)
378
        if sorter_x is None:
379
            sorter_x = range(len(x))
380
            ordered_x = x
381
        else:
382
            ordered_x = x[sorter_x]
383
        if sorter_y is None:
384
            sorter_y = range(len(y))
385
            ordered_y = y
386
        else:
387
            ordered_y = y[sorter_y]
388
389
        y0 = ordered_y[0]
390
391
        best_val = numpy.inf
392
        best_i, best_j = 0, 0
393
394
        y0pC = y0 + C
395
        for i, x in zip(sorter_x, ordered_x):
396
            if x + y0pC >= best_val:
397
                break
398
            xpC = x + C
399
            for j, y in zip(sorter_y, ordered_y):
400
                if xpC + y >= best_val:
401
                    break
402
                val = x + y + Z[i, j]
403
                if val < best_val:
404
                    best_val, best_i, best_j = val, i, j
405
        return best_i, best_j
406
407
    def argmin_xpypZ(x, y, Z):
408
        _, h = Z.shape
409
        A = Z + numpy.reshape(x, (-1, 1))
410
        A += numpy.reshape(y, (1, -1))
411
        i = numpy.argmin(A)
412
        return i // h, i % h
413
414
    def optimal_ordering(tree, M, ordering):
415
        if tree.is_leaf:
416
            M[tree.value.first, tree.value.first] = 0.0
417
        else:
418
            left, right = tree.left, tree.right
419
            if not left.is_leaf:
420
                V_ll = range(*left.left.value.range)
421
                V_lr = range(*left.right.value.range)
422
                u_iter = chain(((u, V_lr) for u in V_ll),
423
                               ((u, V_ll) for u in V_lr))
424
            else:
425
                V_lr = range(*left.value.range)
426
                u_iter = ((u, V_lr) for u in V_lr)
427
428
            u_iter = list(u_iter)
429
            assert [u for u, _ in u_iter] == list(range(*left.value.range))
430
431
            if not right.is_leaf:
432
                V_rl = range(*right.left.value.range)
433
                V_rr = range(*right.right.value.range)
434
                w_iter = chain(((w, V_rr) for w in V_rl),
435
                               ((w, V_rl) for w in V_rr))
436
            else:
437
                V_rr = range(*right.value.range)
438
                w_iter = ((w, V_rr) for w in V_rr)
439
440
            w_iter = list(w_iter)
441
            assert [w for w, _ in w_iter] == list(range(*right.value.range))
442
443
            for u, left_inner in u_iter:
444
                left_inner_slice = slice(left_inner.start, left_inner.stop)
445
                M_left_inner = M[u, left_inner_slice]
446
#                 left_inner_sort = numpy.argsort(M_left_inner)
447
                for w, right_inner in w_iter:
448
                    right_inner_slice = slice(right_inner.start,
449
                                              right_inner.stop)
450
                    M_right_inner = M[w, right_inner_slice]
451
#                     right_inner_sort = numpy.argsort(M_right_inner)
452
#                     i, j = argmin_ordered_xpypZ(
453
#                         M_left_inner, M_right_inner,
454
#                         distances[left_inner_slice, right_inner_slice],
455
#                         sorter_x=left_inner_sort, sorter_y=right_inner_sort,
456
#                     )
457
                    i, j = argmin_xpypZ(
458
                        M_left_inner, M_right_inner,
459
                        distances[left_inner_slice, right_inner_slice]
460
                    )
461
                    m, k = left_inner.start + i, right_inner.start + j
462
                    score = M[u, m] + M[k, w] + distances[m, k]
463
                    M[u, w] = M[w, u] = score
464
                    ordering[u, w] = (u, m, k, w)
465
466
        return M, ordering
467
468
    subtrees = list(postorder(tree))
469
    ordering = {}
470
471
    for i, subtree in enumerate(subtrees):
472
        M, ordering = optimal_ordering(subtree, M, ordering)
473
474
        if progress_callback:
475
            progress_callback(100.0 * i / len(subtrees))
476
477
    def min_uw(tree, u=None, w=None):
478
        if tree.is_leaf:
479
            return 0.0
480
        else:
481
            if u is None:
482
                U = slice(*tree.left.value.range)
483
            else:
484
                U = slice(u, u + 1)
485
            if w is None:
486
                W = slice(*tree.right.value.range)
487
            else:
488
                W = slice(w, w + 1)
489
490
            M_ = M[U, W]
491
            _, w = M_.shape
492
            i = numpy.argmin(M_.ravel())
493
            i, j = i // w, i % w
494
            return U.start + i, W.start + j
495
496
    def optimal_swap(root, M):
0 ignored issues
show
The argument M seems to be unused.
Loading history...
497
        opt_uw = {root: min_uw(root)}
498
        # run down the tree applying u, w constraints from parents.
499
        for tree in preorder(root):
500
            if tree.is_leaf:
501
                pass
502
            else:
503
                u, w = opt_uw[tree]
504
                assert u in range(*tree.value.range)
505
                assert w in range(*tree.value.range)
506
                if u < w:
507
                    u, m, k, w = ordering[u, w]
508
509
                    opt_uw[tree.left] = (u, m)
510
                    opt_uw[tree.right] = (k, w)
511
                else:
512
                    w, k, m, u = ordering[w, u]
513
                    opt_uw[tree.right] = (u, m)
514
515
                    opt_uw[tree.left] = (k, w)
516
517
        def is_swapped(tree):
518
            "Is `tree` swapped based on computed optimal ordering"
519
            if tree.is_leaf:
520
                return False
521
            else:
522
                u, w = opt_uw[tree]
523
                return u > w
524
525
        def swaped_branches(tree):
526
            "Return branches from `tree` in optimal order"
527
            if tree.is_leaf:
528
                return ()
529
            elif is_swapped(tree):
530
                return tree.branches
531
            else:
532
                return tuple(reversed(tree.branches))
533
534
        # Create a new tree structure with optimally swapped branches.
535
        T = {}
536
        counter = count(0)
537
        for tree in postorder(root, branches=swaped_branches):
538
            if tree.is_leaf:
539
                # we need to 're-enumerate' the leaves
540
                i = next(counter)
541
                T[tree] = Tree(tree.value._replace(range=(i, i + 1)), ())
542
            else:
543
                left, right = T[tree.left], T[tree.right]
544
545
                if left.value.first > right.value.first:
546
                    right, left = left, right
547
548
                assert left.value.first < right.value.last
549
                assert left.value.last == right.value.first
550
551
                T[tree] = Tree(tree.value._replace(range=(left.value.first,
552
                                                          right.value.last)),
553
                               (left, right))
554
        return T[root]
555
556
    return optimal_swap(tree, M)
557
558
559
class HierarchicalClustering:
560
    def __init__(self, n_clusters=2, linkage=AVERAGE):
561
        self.n_clusters = n_clusters
562
        self.linkage = linkage
563
564
    def fit(self, X):
565
        self.tree = dist_matrix_clustering(X, linkage=self.linkage)
0 ignored issues
show
The attribute tree was defined outside __init__.

It is generally a good practice to initialize all attributes to default values in the __init__ method:

class Foo:
    def __init__(self, x=None):
        self.x = x
Loading history...
566
        cut = top_clusters(self.tree, self.n_clusters)
567
        labels = numpy.zeros(self.tree.value.last)
568
569
        for i, cl in enumerate(cut):
570
            indices = [leaf.value.index for leaf in leaves(cl)]
571
            labels[indices] = i
572
573
        self.labels = labels
0 ignored issues
show
The attribute labels was defined outside __init__.

It is generally a good practice to initialize all attributes to default values in the __init__ method:

class Foo:
    def __init__(self, x=None):
        self.x = x
Loading history...
574
575
    def fit_predict(self, X, y=None):
0 ignored issues
show
The argument y seems to be unused.
Loading history...
576
        self.fit(X)
577
        return self.labels
578