import attr
# NOTE: ``DistanceMetric`` is used by ``distance()`` below but was not imported in the
# original source; scikit-learn's implementation is assumed here (newer scikit-learn
# releases expose it from ``sklearn.metrics`` instead of ``sklearn.neighbors``).
from sklearn.neighbors import DistanceMetric

from .computing import ClusterDistroComputer

@attr.s
class BaseClustering:
    """Items grouped in clusters/subgroups (e.g. based on proximity, similarity)."""
    clusters = attr.ib(init=True)
    id = attr.ib(init=True)

    def __iter__(self):
        # Iterate over (cluster_id, cluster) pairs rather than over the bare clusters.
        return iter([(cluster.id, cluster) for cluster in self.clusters])

    def __len__(self):
        return len(self.clusters)

    def __getitem__(self, item):
        return self.clusters[item]

    def members_n_assigned_clusters(self):
        """Generate (member_id, cluster_index) tuples for every member of every cluster."""
        for i, cl in enumerate(self.clusters):
            for member_id in iter(cl):
                yield member_id, i

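# Illustrative sketch (not part of the original module): BaseClustering only assumes
# that each cluster exposes an ``id`` attribute and supports iteration over its member
# ids; the ``SimpleCluster`` type below is hypothetical and shown only as an example.
#
#   @attr.s
#   class SimpleCluster:
#       id = attr.ib()
#       members = attr.ib()
#
#       def __iter__(self):
#           return iter(self.members)
#
#       def __len__(self):
#           return len(self.members)
#
#   grouping = BaseClustering(clusters=[SimpleCluster(0, ['a', 'b']),
#                                       SimpleCluster(1, ['c'])], id='demo')
#   list(grouping.members_n_assigned_clusters())  # [('a', 0), ('b', 0), ('c', 1)]
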
@attr.s
class DatapointsCluster(BaseClustering):
    """
    Provide a method to return the datapoints of a cluster and a method to get the values of all datapoints for a given attribute.
    """
    datapoints_extractor = attr.ib(init=True)  # call(cluster) -> datapoints
    attributes_extractor = attr.ib(init=True)  # call(datapoints, attribute) -> iterable of attribute_value per datapoint

    distro_computer = attr.ib(init=False, default=attr.Factory(lambda self: ClusterDistroComputer.from_extractors(
        self.datapoints_extractor, self.attributes_extractor), takes_self=True))
    # Cache of items already located in the clustering, so that we do not seek them again next time.
    # attr.Factory(dict) gives each instance its own cache instead of a shared mutable default.
    members = attr.ib(init=False, default=attr.Factory(dict))

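# Illustrative extractor callables (hypothetical, shown only to demonstrate the
# documented contracts): ``points_table`` is an assumed mapping from member ids to
# datapoint dicts, and ``clusters`` an assumed list of cluster objects.
#
#   def datapoints_extractor(cluster):
#       return [points_table[member_id] for member_id in cluster]
#
#   def attributes_extractor(datapoints, attribute):
#       return [datapoint[attribute] for datapoint in datapoints]
#
#   clustering = DatapointsCluster(clusters, 'my-clustering',
#                                  datapoints_extractor, attributes_extractor)
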
@attr.s
class ReportingClustering(DatapointsCluster):
    """
    An instance of this class encapsulates the behaviour of a clustering; a set of clusters estimated on some data.
    """

    pre = 2  # default precision used when formatting probabilities

    def __str__(self):
        body, max_lens = self._get_rows(threshold=10, prob_precision=self.pre)
        header = self._get_header(max_lens, self.pre, list(range(len(self))))
        return header + body

    def cluster_of(self, item):
        # Look up the cached cluster id first and only seek through the clusters on a
        # cache miss (dict.get would evaluate the fallback eagerly and defeat the cache).
        h = hash(item)
        if h in self.members:
            return self.members[h]
        return self._find_cluster(h)

    def _find_cluster(self, item):
        """Seek through the clusters for the given item and cache the id of the cluster containing it."""
        for cluster_id, cluster in self:
            if item in cluster.members:
                self.members[item] = cluster_id
                break
        # Raises KeyError if the item was not found in any cluster.
        return self.members[item]

    def gen_clusters(self, selected):
        """
        Generate Cluster objects according to the indices in the selected clusters list.

        :param selected: the indices of the clusters to select
        :type selected: list
        :return: a generator over the selected Cluster objects
        :rtype: generator
        """
        for i in selected:
            yield self[i]

    def get_closest(self, an_id, n, metric='euclidean'):
        """Return the distances to the n closest vectors (within the same cluster) to the vector corresponding to the input id."""
        # Assumes an ``id2vec`` mapping from ids to vectors is available on the instance, and
        # that cluster ids coincide with positional indices (as in the original implementation).
        cluster = self[self._find_cluster(an_id)]
        distances = [distance(self.id2vec[an_id], vec, metric=metric) for vec in cluster]
        # "Closest" means smallest distance, so sort in ascending order.
        return sorted(distances)[:n]

    def compute_stats1(self, cluster, attributes):
        self._stats = self.distro_computer(cluster, attributes)

    def print_clusters(self, selected_clusters='all', threshold=10, prec=2):
        if selected_clusters == 'all':
            selected_clusters = range(len(self))
        body, max_lens = self._get_rows(threshold=threshold, prob_precision=prec)
        header = self._get_header(max_lens, prec, selected_clusters)
        print(header + body)

    def print_map(self):
        # Assumes a ``map_buffer`` string has been set on the instance elsewhere.
        print(self.map_buffer)

    def _get_header(self, max_lens, prec, selected_clusters):
        assert len(max_lens) == len(selected_clusters)
        return ' - '.join(
            'id:{} len:{}'.format(cl.id, len(cl)) + ' ' * (prec + max_lens[i] - len(str(len(cl))) - 6)
            for i, cl in enumerate(self.gen_clusters(selected_clusters))) + '\n'

    def _get_rows(self, threshold=10, prob_precision=3):
        """Build the table body: one row per rank, one column per cluster holding a token and its relative frequency."""
        # Assumes each cluster exposes a ``grams`` Counter with at least ``threshold`` entries.
        top_grams = [cl.grams.most_common(threshold) for cl in self.clusters]
        max_token_lens = [max(len(token) for token, _ in top) for top in top_grams]
        b = ''
        for i in range(threshold):
            b += ' | '.join(
                '{} '.format(top[i][0]) + ' ' * (max_token_lens[j] - len(top[i][0])) +
                '{1:.{0}f}'.format(prob_precision, top[i][1] / len(cl))
                for j, (cl, top) in enumerate(zip(self.clusters, top_grams))) + '\n'
        return b, max_token_lens

def distance(vec1, vec2, metric='euclidean'):
    """Return the distance between the two vectors under the named scikit-learn metric."""
    return DistanceMetric.get_metric(metric).pairwise([vec1, vec2])[0][1]
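
# Minimal self-check sketch (not part of the original module); it only exercises the
# standalone ``distance`` helper. Run it with ``python -m <package>.<module>`` so that
# the relative import above resolves.
if __name__ == '__main__':
    # A 3-4-5 right triangle: the euclidean distance should print as 5.0.
    print(distance([0.0, 0.0], [3.0, 4.0]))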