from collections import namedtuple, deque
from operator import attrgetter
from itertools import chain, count
import heapq

import numpy

import scipy.cluster.hierarchy

from Orange.distance import Euclidean, PearsonR

__all__ = ['HierarchicalClustering']

SINGLE = "single"
AVERAGE = "average"
COMPLETE = "complete"
WEIGHTED = "weighted"
WARD = "ward"


def condensedform(X, mode="upper"):
    X = numpy.asarray(X)
    assert len(X.shape) == 2
    assert X.shape[0] == X.shape[1]

    N = X.shape[0]

    if mode == "upper":
        i, j = numpy.triu_indices(N, k=1)
    elif mode == "lower":
        i, j = numpy.tril_indices(N, k=-1)
    else:
        raise ValueError("invalid mode")
    return X[i, j]


def squareform(X, mode="upper"):
    X = numpy.asarray(X)
    k = X.shape[0]
    N = int(numpy.ceil(numpy.sqrt(k * 2)))
    assert N * (N - 1) // 2 == k
    matrix = numpy.zeros((N, N), dtype=X.dtype)
    if mode == "upper":
        i, j = numpy.triu_indices(N, k=1)
        matrix[i, j] = X
        m, n = numpy.tril_indices(N, k=-1)
        matrix[m, n] = matrix.T[m, n]
    elif mode == "lower":
        i, j = numpy.tril_indices(N, k=-1)
        matrix[i, j] = X
        m, n = numpy.triu_indices(N, k=1)
        matrix[m, n] = matrix.T[m, n]
    return matrix
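

# Example (illustrative sketch, not part of the original module): for a
# symmetric, zero-diagonal distance matrix, `condensedform` extracts the
# strict upper triangle row by row, and `squareform` rebuilds the square
# matrix from that vector:
#
#     >>> D = numpy.array([[0.0, 1.0, 2.0],
#     ...                  [1.0, 0.0, 3.0],
#     ...                  [2.0, 3.0, 0.0]])
#     >>> condensedform(D).tolist()
#     [1.0, 2.0, 3.0]
#     >>> bool(numpy.all(squareform(condensedform(D)) == D))
#     True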


def data_clustering(data, distance=Euclidean,
                    linkage=AVERAGE):
    """
    Return the hierarchical clustering of the data set's rows.

    :param Orange.data.Table data: Data set to cluster.
    :param Orange.distance.Distance distance: A distance measure.
    :param str linkage:
        The linkage method; one of ``SINGLE``, ``AVERAGE``, ``COMPLETE``,
        ``WEIGHTED`` or ``WARD``.
    """
    matrix = distance(data)
    return dist_matrix_clustering(matrix, linkage=linkage)


def feature_clustering(data, distance=PearsonR,
                       linkage=AVERAGE):
    """
    Return the hierarchical clustering of the data set's columns.

    :param Orange.data.Table data: Data set to cluster.
    :param Orange.distance.Distance distance: A distance measure.
    :param str linkage:
        The linkage method; one of ``SINGLE``, ``AVERAGE``, ``COMPLETE``,
        ``WEIGHTED`` or ``WARD``.
    """
    matrix = distance(data, axis=0)
    return dist_matrix_clustering(matrix, linkage=linkage)


def dist_matrix_linkage(matrix, linkage=AVERAGE):
    """
    Return the linkage matrix computed from a precomputed distance matrix.

    :param Orange.misc.DistMatrix matrix: A symmetric distance matrix.
    :param str linkage: The linkage method (passed to scipy as ``method``).
    """
    # Extract the condensed (upper triangular) distance matrix.
    distances = condensedform(matrix)
    return scipy.cluster.hierarchy.linkage(distances, method=linkage)


def dist_matrix_clustering(matrix, linkage=AVERAGE):
    """
    Return the hierarchical clustering using a precomputed distance matrix.

    :param Orange.misc.DistMatrix matrix: A symmetric distance matrix.
    :param str linkage: The linkage method (passed to scipy as ``method``).
    """
    # Extract the condensed (upper triangular) distance matrix.
    distances = condensedform(matrix)
    Z = scipy.cluster.hierarchy.linkage(distances, method=linkage)
    return tree_from_linkage(Z)


def sample_clustering(X, linkage=AVERAGE, metric="euclidean"):
    assert len(X.shape) == 2
    Z = scipy.cluster.hierarchy.linkage(X, method=linkage, metric=metric)
    return tree_from_linkage(Z)
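

# Usage sketch (assumes an Orange data table is available; not part of the
# original module). `data_clustering` clusters rows, `feature_clustering`
# clusters columns, and `dist_matrix_clustering` accepts any symmetric
# distance matrix:
#
#     >>> from Orange.data import Table
#     >>> iris = Table("iris")
#     >>> tree = data_clustering(iris)
#     >>> tree.value.range
#     (0, 150)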


Tree = namedtuple("Tree", ["value", "branches"])

ClusterData = namedtuple("Cluster", ["range", "height"])
SingletonData = namedtuple("Singleton", ["range", "height", "index"])


class _Ranged:

    @property
    def first(self):
        return self.range[0]

    @property
    def last(self):
        return self.range[-1]


class ClusterData(ClusterData, _Ranged):
    __slots__ = ()


class SingletonData(SingletonData, _Ranged):
    __slots__ = ()


class Tree(Tree):
    def __new__(cls, value, branches=()):
        if not isinstance(branches, tuple):
            raise TypeError()

        return super().__new__(cls, value, branches)

    @property
    def is_leaf(self):
        return not bool(self.branches)

    @property
    def left(self):
        return self.branches[0]

    @property
    def right(self):
        return self.branches[-1]
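

# Structure sketch (illustrative, not part of the original module): a
# cluster node carries a half-open `range` of leaf positions and a merge
# `height`; leaves additionally carry the `index` of the original item.
#
#     >>> a = Tree(SingletonData(range=(0, 1), height=0.0, index=0))
#     >>> b = Tree(SingletonData(range=(1, 2), height=0.0, index=1))
#     >>> root = Tree(ClusterData(range=(0, 2), height=1.5), (a, b))
#     >>> root.is_leaf, root.left is a, root.value.first, root.value.last
#     (False, True, 0, 2)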


def tree_from_linkage(linkage):
    """
    Return a Tree representation of a clustering encoded in a linkage matrix.

    .. seealso:: scipy.cluster.hierarchy.linkage

    """
    T = {}
    N, _ = linkage.shape
    N = N + 1
    # Leaf order as encountered in the linkage rows; row i of the linkage
    # matrix joins clusters `c1` and `c2` into a new cluster with id N + i.
    order = []
    for i, (c1, c2, d, _) in enumerate(linkage):
        if c1 < N:
            left = Tree(SingletonData(range=(len(order), len(order) + 1),
                                      height=0.0, index=int(c1)),
                        ())
            order.append(c1)
        else:
            left = T[c1]

        if c2 < N:
            right = Tree(SingletonData(range=(len(order), len(order) + 1),
                                       height=0.0, index=int(c2)),
                         ())
            order.append(c2)
        else:
            right = T[c2]

        t = Tree(ClusterData(range=(left.value.first, right.value.last),
                             height=d),
                 (left, right))
        T[N + i] = t

    root = T[N + N - 2]
    T = {}

    # Second pass: renumber the leaf ranges so they follow the left-to-right
    # order of the leaves in the final tree.
    leaf_idx = 0
    for node in postorder(root):
        if node.is_leaf:
            T[node] = Tree(
                node.value._replace(range=(leaf_idx, leaf_idx + 1)), ())
            leaf_idx += 1
        else:
            left, right = T[node.left].value, T[node.right].value
            assert left.first < right.first

            t = Tree(
                node.value._replace(range=(left.range[0], right.range[1])),
                tuple(T[ch] for ch in node.branches)
            )
            assert t.value.range[0] <= t.value.range[-1]
            assert left.first == t.value.first and right.last == t.value.last
            assert t.value.first < right.first
            assert t.value.last > left.last
            T[node] = t

    return T[root]
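

# Example (illustrative sketch): a linkage matrix over three items where
# items 0 and 1 merge at height 1.0 and item 2 is joined with that cluster
# at height 2.0. Because the second row lists item 2 first, it becomes the
# leftmost leaf; leaf ranges are renumbered left to right, while `index`
# keeps the original item ids:
#
#     >>> Z = numpy.array([[0.0, 1.0, 1.0, 2.0],
#     ...                  [2.0, 3.0, 2.0, 3.0]])
#     >>> root = tree_from_linkage(Z)
#     >>> float(root.value.height), root.value.range
#     (2.0, (0, 3))
#     >>> [leaf.value.index for leaf in leaves(root)]
#     [2, 0, 1]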


def postorder(tree, branches=attrgetter("branches")):
    stack = deque([tree])
    visited = set()

    while stack:
        current = stack.popleft()
        children = branches(current)
        if children:
            # yield the item on the way up
            if current in visited:
                yield current
            else:
                # stack = children + [current] + stack
                stack.extendleft([current])
                stack.extendleft(reversed(children))
                visited.add(current)

        else:
            yield current
            visited.add(current)


def preorder(tree, branches=attrgetter("branches")):
    stack = deque([tree])
    while stack:
        current = stack.popleft()
        yield current
        children = branches(current)
        if children:
            stack.extendleft(reversed(children))


def leaves(tree, branches=attrgetter("branches")):
    """
    Return an iterator over the leaf nodes in a tree structure.
    """
    return (node for node in postorder(tree, branches)
            if node.is_leaf)
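

# Traversal sketch (illustrative): with `a`, `b` and `root` built as in the
# `Tree` example above, `postorder` yields children before their parent,
# `preorder` yields parents first, and `leaves` filters postorder down to
# the leaf nodes:
#
#     >>> a = Tree(SingletonData(range=(0, 1), height=0.0, index=0))
#     >>> b = Tree(SingletonData(range=(1, 2), height=0.0, index=1))
#     >>> root = Tree(ClusterData(range=(0, 2), height=1.5), (a, b))
#     >>> [t is root for t in postorder(root)]
#     [False, False, True]
#     >>> [t is root for t in preorder(root)]
#     [True, False, False]
#     >>> [leaf.value.index for leaf in leaves(root)]
#     [0, 1]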


def prune(cluster, level=None, height=None, condition=None):
    """
    Prune the clustering instance ``cluster``.

    :param Tree cluster: Cluster root node to prune.
    :param int level: If not `None`, prune all clusters deeper than `level`.
    :param float height:
        If not `None`, prune all clusters with height lower than `height`.
    :param function condition:
        If not `None`, `condition` must be a `Tree -> bool` function
        evaluating to `True` if the cluster should be pruned.

    .. note::
        At least one of the `level`, `height` or `condition` arguments
        needs to be supplied.

    """
    if not any(arg is not None for arg in [level, height, condition]):
        raise ValueError("At least one pruning argument must be supplied")

    level_check = height_check = condition_check = lambda cl: False

    if level is not None:
        cluster_depth = cluster_depths(cluster)
        level_check = lambda cl: cluster_depth[cl] >= level

    if height is not None:
        height_check = lambda cl: cl.value.height <= height

    if condition is not None:
        condition_check = condition

    def check_all(cl):
        return level_check(cl) or height_check(cl) or condition_check(cl)

    T = {}

    for node in postorder(cluster):
        if check_all(node):
            if node.is_leaf:
                T[node] = node
            else:
                T[node] = Tree(node.value, ())
        else:
            T[node] = Tree(node.value,
                           tuple(T[ch] for ch in node.branches))
    return T[cluster]
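

# Example (illustrative sketch): pruning the three-leaf tree from the
# `tree_from_linkage` example at height 1.0 collapses the lower cluster
# into a childless node, while the root is kept intact:
#
#     >>> Z = numpy.array([[0.0, 1.0, 1.0, 2.0],
#     ...                  [2.0, 3.0, 2.0, 3.0]])
#     >>> root = tree_from_linkage(Z)
#     >>> pruned = prune(root, height=1.0)
#     >>> pruned.is_leaf, pruned.right.is_leaf, float(pruned.right.value.height)
#     (False, True, 1.0)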


def cluster_depths(cluster):
    """
    Return a dictionary mapping :class:`Tree` instances to their depth.

    :param Tree cluster: Root cluster
    :rtype: :class:`dict`

    """
    depths = {}
    depths[cluster] = 0
    for cluster in preorder(cluster):
        cl_depth = depths[cluster]
        depths.update(dict.fromkeys(cluster.branches, cl_depth + 1))
    return depths


def top_clusters(tree, k):
    """
    Return the `k` topmost clusters from the hierarchical clustering.

    :param Tree tree: Root cluster.
    :param int k: Number of top clusters.

    :rtype: list of :class:`Tree` instances
    """
    def item(node):
        return ((node.is_leaf, -node.value.height), node)

    heap = [item(tree)]

    while len(heap) < k:
        key, cl = heapq.heappop(heap)
        if cl.is_leaf:
            assert all(n.is_leaf for _, n in heap)
            heapq.heappush(heap, (key, cl))
            break

        left, right = cl.left, cl.right
        heapq.heappush(heap, item(left))
        heapq.heappush(heap, item(right))

    return [n for _, n in heap]
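

# Example (illustrative sketch): cutting the three-leaf tree from the
# `tree_from_linkage` example into two clusters splits the root's highest
# merge; the two returned subtrees cover leaf ranges (0, 1) and (1, 3):
#
#     >>> Z = numpy.array([[0.0, 1.0, 1.0, 2.0],
#     ...                  [2.0, 3.0, 2.0, 3.0]])
#     >>> root = tree_from_linkage(Z)
#     >>> sorted(cl.value.range for cl in top_clusters(root, 2))
#     [(0, 1), (1, 3)]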


def optimal_leaf_ordering(tree, distances, progress_callback=None):
    """
    Order the leaves in the clustering tree.

    (based on Ziv Bar-Joseph et al. Fast optimal leaf ordering for
    hierarchical clustering)

    :param Tree tree:
        Binary hierarchical clustering tree.
    :param numpy.ndarray distances:
        A (N, N) numpy.ndarray of distances that were used to compute
        the clustering.
    :param function progress_callback:
        Function used to report on progress.

    """
    distances = numpy.asarray(distances)
    M = numpy.zeros_like(distances)

    # rearrange distances by the order defined by the tree's leaves
    indices = numpy.array([leaf.value.index for leaf in leaves(tree)])
    distances = distances[indices[numpy.newaxis, :],
                          indices[:, numpy.newaxis]]
    distances = numpy.ascontiguousarray(distances)

    # This is the 'fast' early termination search described in the paper
    # (it is slower in the pure python implementation)
    def argmin_ordered_xpypZ(x, y, Z, sorter_x=None, sorter_y=None):
        C = numpy.min(Z)
        if sorter_x is None:
            sorter_x = range(len(x))
            ordered_x = x
        else:
            ordered_x = x[sorter_x]
        if sorter_y is None:
            sorter_y = range(len(y))
            ordered_y = y
        else:
            ordered_y = y[sorter_y]

        y0 = ordered_y[0]

        best_val = numpy.inf
        best_i, best_j = 0, 0

        y0pC = y0 + C
        for i, x in zip(sorter_x, ordered_x):
            if x + y0pC >= best_val:
                break
            xpC = x + C
            for j, y in zip(sorter_y, ordered_y):
                if xpC + y >= best_val:
                    break
                val = x + y + Z[i, j]
                if val < best_val:
                    best_val, best_i, best_j = val, i, j
        return best_i, best_j

    def argmin_xpypZ(x, y, Z):
        _, h = Z.shape
        A = Z + numpy.reshape(x, (-1, 1))
        A += numpy.reshape(y, (1, -1))
        i = numpy.argmin(A)
        return i // h, i % h

    def optimal_ordering(tree, M, ordering):
        if tree.is_leaf:
            M[tree.value.first, tree.value.first] = 0.0
        else:
            left, right = tree.left, tree.right
            if not left.is_leaf:
                V_ll = range(*left.left.value.range)
                V_lr = range(*left.right.value.range)
                u_iter = chain(((u, V_lr) for u in V_ll),
                               ((u, V_ll) for u in V_lr))
            else:
                V_lr = range(*left.value.range)
                u_iter = ((u, V_lr) for u in V_lr)

            u_iter = list(u_iter)
            assert [u for u, _ in u_iter] == list(range(*left.value.range))

            if not right.is_leaf:
                V_rl = range(*right.left.value.range)
                V_rr = range(*right.right.value.range)
                w_iter = chain(((w, V_rr) for w in V_rl),
                               ((w, V_rl) for w in V_rr))
            else:
                V_rr = range(*right.value.range)
                w_iter = ((w, V_rr) for w in V_rr)

            w_iter = list(w_iter)
            assert [w for w, _ in w_iter] == list(range(*right.value.range))

            for u, left_inner in u_iter:
                left_inner_slice = slice(left_inner.start, left_inner.stop)
                M_left_inner = M[u, left_inner_slice]
                # left_inner_sort = numpy.argsort(M_left_inner)
                for w, right_inner in w_iter:
                    right_inner_slice = slice(right_inner.start,
                                              right_inner.stop)
                    M_right_inner = M[w, right_inner_slice]
                    # right_inner_sort = numpy.argsort(M_right_inner)
                    # i, j = argmin_ordered_xpypZ(
                    #     M_left_inner, M_right_inner,
                    #     distances[left_inner_slice, right_inner_slice],
                    #     sorter_x=left_inner_sort, sorter_y=right_inner_sort,
                    # )
                    i, j = argmin_xpypZ(
                        M_left_inner, M_right_inner,
                        distances[left_inner_slice, right_inner_slice]
                    )
                    m, k = left_inner.start + i, right_inner.start + j
                    score = M[u, m] + M[k, w] + distances[m, k]
                    M[u, w] = M[w, u] = score
                    ordering[u, w] = (u, m, k, w)

        return M, ordering

    subtrees = list(postorder(tree))
    ordering = {}

    for i, subtree in enumerate(subtrees):
        M, ordering = optimal_ordering(subtree, M, ordering)

        if progress_callback:
            progress_callback(100.0 * i / len(subtrees))

    def min_uw(tree, u=None, w=None):
        if tree.is_leaf:
            return 0.0
        else:
            if u is None:
                U = slice(*tree.left.value.range)
            else:
                U = slice(u, u + 1)
            if w is None:
                W = slice(*tree.right.value.range)
            else:
                W = slice(w, w + 1)

            M_ = M[U, W]
            _, w = M_.shape
            i = numpy.argmin(M_.ravel())
            i, j = i // w, i % w
            return U.start + i, W.start + j

    def optimal_swap(root, M):
        opt_uw = {root: min_uw(root)}
        # run down the tree applying u, w constraints from parents.
        for tree in preorder(root):
            if tree.is_leaf:
                pass
            else:
                u, w = opt_uw[tree]
                assert u in range(*tree.value.range)
                assert w in range(*tree.value.range)
                if u < w:
                    u, m, k, w = ordering[u, w]
                    opt_uw[tree.left] = (u, m)
                    opt_uw[tree.right] = (k, w)
                else:
                    w, k, m, u = ordering[w, u]
                    opt_uw[tree.right] = (u, m)
                    opt_uw[tree.left] = (k, w)

        def is_swapped(tree):
            "Is `tree` swapped based on computed optimal ordering"
            if tree.is_leaf:
                return False
            else:
                u, w = opt_uw[tree]
                return u > w

        def swapped_branches(tree):
            "Return branches from `tree` in optimal order"
            if tree.is_leaf:
                return ()
            elif is_swapped(tree):
                return tree.branches
            else:
                return tuple(reversed(tree.branches))

        # Create a new tree structure with optimally swapped branches.
        T = {}
        counter = count(0)
        for tree in postorder(root, branches=swapped_branches):
            if tree.is_leaf:
                # we need to 're-enumerate' the leaves
                i = next(counter)
                T[tree] = Tree(tree.value._replace(range=(i, i + 1)), ())
            else:
                left, right = T[tree.left], T[tree.right]

                if left.value.first > right.value.first:
                    right, left = left, right

                assert left.value.first < right.value.last
                assert left.value.last == right.value.first

                T[tree] = Tree(tree.value._replace(range=(left.value.first,
                                                          right.value.last)),
                               (left, right))
        return T[root]

    return optimal_swap(tree, M)
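

# Usage sketch (illustrative): reorder the leaves of a clustering so that
# adjacent leaves are as similar as possible, given the same square
# distance matrix `D` that produced the tree:
#
#     >>> D = squareform(numpy.array([1.0, 6.0, 5.0, 4.0, 3.0, 2.0]))
#     >>> tree = dist_matrix_clustering(D)
#     >>> ordered = optimal_leaf_ordering(tree, D)
#     >>> len(list(leaves(ordered))) == len(list(leaves(tree)))
#     True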


class HierarchicalClustering:
    def __init__(self, n_clusters=2, linkage=AVERAGE):
        self.n_clusters = n_clusters
        self.linkage = linkage

    def fit(self, X):
        self.tree = dist_matrix_clustering(X, linkage=self.linkage)
        cut = top_clusters(self.tree, self.n_clusters)
        labels = numpy.zeros(self.tree.value.last)

        for i, cl in enumerate(cut):
            indices = [leaf.value.index for leaf in leaves(cl)]
            labels[indices] = i

        self.labels = labels

    def fit_predict(self, X, y=None):
        self.fit(X)
        return self.labels
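

if __name__ == "__main__":
    # Small self-contained demo (illustrative, not part of the original
    # module): cluster four 2-D points from their pairwise Euclidean
    # distance matrix computed with plain numpy. The two tight pairs
    # should end up in separate clusters.
    points = numpy.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
    diff = points[:, numpy.newaxis, :] - points[numpy.newaxis, :, :]
    dist = numpy.sqrt((diff ** 2).sum(axis=-1))
    print(HierarchicalClustering(n_clusters=2).fit_predict(dist))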