Passed
Push — datapoints-package (a11eff)
by Konstantinos
created 02:48

so_magic.clustering.clustering.ReportingClustering._get_rows()    Rating: A

Complexity

Conditions 3

Size

Total Lines 9
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric   Value
cc       3
eloc     9
nop      3
dl       0
loc      9
rs       9.95
c        0
b        0
f        0

# from sklearn.neighbors import DistanceMetric  # assumed import for DistanceMetric, used by distance() below
# import attr

# from .computing import ClusterDistroComputer


# @attr.s
# class BaseClustering:
#     """Items grouped into clusters/subgroups (e.g. based on proximity or similarity)."""
#     clusters = attr.ib(init=True)
#     id = attr.ib(init=True)

#     def __iter__(self):
#         return iter([(cluster.id, cluster) for cluster in self.clusters])

#     def __len__(self):
#         return len(self.clusters)

#     def __getitem__(self, item):
#         return self.clusters[item]

#     def members_n_assigned_clusters(self):
#         """Generate tuples of cluster members and their assigned cluster index."""
#         for i, cl in enumerate(self.clusters):
#             for member_id in iter(cl):
#                 yield member_id, i


# @attr.s
# class DatapointsCluster(BaseClustering):
#     """
#     Provide a method that returns the datapoints of a cluster, and a method that returns,
#     for a given attribute, the attribute value of every datapoint.
#     """
#     datapoints_extractor = attr.ib(init=True)  # call(cluster) -> datapoints
#     attributes_extractor = attr.ib(init=True)  # call(datapoints, attribute) -> iterable of attribute values, one per datapoint

#     distro_computer = attr.ib(init=False, default=attr.Factory(lambda self: ClusterDistroComputer.from_extractors(
#         self.datapoints_extractor, self.attributes_extractor), takes_self=True))
#     members = attr.ib(init=False, default=attr.Factory(dict))  # cache of items already located in the clustering, so we do not search for them again


# @attr.s
# class ReportingClustering(DatapointsCluster):
#     """
#     An instance of this class encapsulates the behaviour of a clustering: a set of clusters estimated on some data.
#     """

#     pre = 2  # default number of decimal digits for reported probabilities

#     def __str__(self):
#         body, max_lens = self._get_rows(threshold=10, prob_precision=self.pre)
#         header = self._get_header(max_lens, self.pre, list(range(len(self))))
#         return header + body

#     def cluster_of(self, item):
#         h = hash(item)
#         return self.members.get(h, self._find_cluster(h))

#     def _find_cluster(self, item):
#         """Seek through the clusters for the given item and cache the id of the cluster it belongs to."""
#         for cluster in self.clusters:
#             if item in cluster.members:
#                 self.members[item] = cluster.id
#                 return self.members[item]

#     def gen_clusters(self, selected):
#         """
#         Generate the Cluster objects corresponding to the given indices.
#         :param selected: the indices of the clusters to select
#         :type selected: list
#         :return: the selected clusters
#         :rtype: Cluster
#         """
#         for i in selected:
#             yield self[i]

#     def get_closest(self, an_id, n, metric='euclidean'):
#         """Find the n closest vectors (within the same cluster) to the vector corresponding to the input id."""
#         return sorted(map(lambda x: distance(self.id2vec[an_id], x, metric=metric),
#                           [_ for _ in self[self._find_cluster(an_id)]]))[:n]

#     def compute_stats1(self, cluster, attributes):
#         self._stats = self.distro_computer(cluster, attributes)

#     def print_clusters(self, selected_clusters='all', threshold=10, prec=2):
#         if selected_clusters == 'all':
#             selected_clusters = range(len(self))
#         body, max_lens = self._get_rows(threshold=threshold, prob_precision=prec)
#         header = self._get_header(max_lens, prec, selected_clusters)
#         # header = ' - '.join('id:{} len:{}'.format(i, len(self[i])) + ' ' * (3-9 + prec + max_lens[i] - len(str(len(self[i])))) for i in selected_clusters) + '\n'
#         print(header + body)

#     def print_map(self):
#         print(self.map_buffer)

#     def _get_header(self, max_lens, prec, selected_clusters):
#         """Build the one-line header: one column per selected cluster, reporting its id and size, padded to the column width."""
#         assert len(max_lens) == len(selected_clusters)
#         return ' - '.join(
#             'id:{} len:{}'.format(cl.id, len(cl)) + ' ' * (prec + max_lens[i] - len(str(len(cl))) - 6) for i, cl in
#             enumerate(self.gen_clusters(selected_clusters))) + '\n'

#     def _get_rows(self, threshold=10, prob_precision=3):
#         """Build the table body: one column per cluster, each row pairing one of the cluster's most common
#         tokens with its in-cluster relative frequency. Also return the maximum token length per cluster."""
#         max_token_lens = [max(map(lambda x: len(x[0]), cl.grams.most_common(threshold))) for cl in self.clusters]
#         b = ''
#         for i in range(threshold):
#             b += ' | '.join('{} '.format(cl.grams.most_common(threshold)[i][0]) + ' ' * (
#                         max_token_lens[j] - len(cl.grams.most_common(threshold)[i][0])) +
#                             "{1:.{0}f}".format(prob_precision, cl.grams.most_common(threshold)[i][1] / len(cl)) for
#                             j, cl in enumerate(self.clusters)) + '\n'
#         return b, max_token_lens


# def distance(vec1, vec2, metric='euclidean'):
#     return DistanceMetric.get_metric(metric).pairwise([vec1, vec2])[0][1]
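
Since the whole module above is commented out, the quickest way to see what the reporting output would look like is a small, self-contained sketch. FakeCluster and render below are hypothetical stand-ins (not part of the so_magic package) that mirror the _get_header/_get_rows logic: one column per cluster, each row pairing a frequent token with its relative frequency inside that cluster.

from collections import Counter

# Hypothetical stand-in for a cluster: the listing above assumes each cluster
# exposes an `id`, a `grams` Counter of tokens, and a length.
class FakeCluster:
    def __init__(self, id_, tokens):
        self.id = id_
        self.grams = Counter(tokens)

    def __len__(self):
        return sum(self.grams.values())


def render(clusters, threshold=3, precision=2):
    # Mirror the _get_header/_get_rows logic: one column per cluster, each row
    # pairing a frequent token with its in-cluster relative frequency.
    tops = [cl.grams.most_common(threshold) for cl in clusters]
    widths = [max(len(token) for token, _ in top) for top in tops]
    header = ' - '.join(
        'id:{} len:{}'.format(cl.id, len(cl)).ljust(widths[j] + precision + 4)
        for j, cl in enumerate(clusters)) + '\n'
    body = ''
    for i in range(threshold):
        body += ' | '.join(
            '{} {:.{}f}'.format(tops[j][i][0].ljust(widths[j]),
                                tops[j][i][1] / len(cl), precision)
            for j, cl in enumerate(clusters)) + '\n'
    return header + body


if __name__ == '__main__':
    c0 = FakeCluster(0, ['alpha', 'alpha', 'beta', 'gamma'])
    c1 = FakeCluster(1, ['delta', 'delta', 'delta', 'epsilon', 'zeta'])
    print(render([c0, c1]))

The real ReportingClustering assembles the same kind of table from its clusters attribute, with pre controlling how many decimal digits of each probability are printed.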