|
1
|
|
|
import numpy as np |
|
2
|
|
|
|
|
3
|
|
|
|
|
4
|
|
|
class DistroReporter(object): |
|
5
|
|
|
|
|
6
|
|
|
def __init__(self): |
|
7
|
|
|
self.cl = None |
|
8
|
|
|
self.var = '' |
|
9
|
|
|
self.sl = None |
|
10
|
|
|
|
|
11
|
|
|
self.ordered_value_labels = [] |
|
12
|
|
|
self.max_nb_rows = 0 |
|
13
|
|
|
self.max_label_len = 0 |
|
14
|
|
|
self.generators = [] |
|
15
|
|
|
|
|
16
|
|
|
def print_distros(self, clustering, variable, selected_clusters='all', prec=3): |
|
17
|
|
|
""" |
|
18
|
|
|
Prints the discrete distribution of the values the input variable takes for evry selected cluster. Frequencies are shown in descending order.\n |
|
19
|
|
|
:param clustering: the Clustering to select from |
|
20
|
|
|
:type clustering: clustering.Cluster |
|
21
|
|
|
:param variable: the field name of interest |
|
22
|
|
|
:type variable: str |
|
23
|
|
|
:param selected_clusters: can be a list of indices pointing to Cluster objects in the Clustering structure. Can take the 'all' value to indicate selecting every cluster |
|
24
|
|
|
:type selected_clusters: list or str |
|
25
|
|
|
:param prec: the precision of the frequencies to visualize; the number of decimal digits to show |
|
26
|
|
|
:type prec: int |
|
27
|
|
|
""" |
|
28
|
|
|
self._set_state(clustering, variable, selected_clusters, prec) |
|
29
|
|
|
body = '' |
|
30
|
|
|
for i in range(self.max_nb_rows): |
|
31
|
|
|
body += ' | '.join(str(self.generators[j].__next__()) for j in range(len(self.sl))) + '\n' |
|
32
|
|
|
header = ' - '.join('id:{} len:{}'.format(cl.id, len(cl)) + ' '*(prec + self.max_label_len[i] - len(str(len(cl))) - 6) for i, cl in enumerate(self.cl.gen_clusters(self.sl))) + '\n' |
|
33
|
|
|
print(header + body) |
|
34
|
|
|
|
|
35
|
|
|
def _set_state(self, clustering, variable, selected_clusters, prec): |
|
36
|
|
|
self.cl = clustering |
|
37
|
|
|
self.var = variable |
|
38
|
|
|
if selected_clusters == 'all': |
|
39
|
|
|
selected_clusters = range(len(self.cl)) |
|
40
|
|
|
self.sl = selected_clusters |
|
41
|
|
|
self.ordered_value_labels = [sorted(cl.freqs[self.var], key=lambda x: cl.freqs[self.var][x], reverse=True) for cl in self.cl.gen_clusters(self.sl)] |
|
|
|
|
|
|
42
|
|
|
self.max_nb_rows = max(map(lambda x: len(x), self.ordered_value_labels)) |
|
43
|
|
|
self.max_label_len = [max(map(lambda x: len(x), cl.freqs[variable])) for cl in self.cl.gen_clusters(self.sl)] |
|
44
|
|
|
self.generators = [self._get_generator(i, prec) for i in range(len(self.sl))] |
|
45
|
|
|
|
|
46
|
|
|
def _gen_entries(self, ind, prec): |
|
47
|
|
|
i = 0 |
|
48
|
|
|
for i, el in enumerate(self.ordered_value_labels[ind]): |
|
49
|
|
|
yield '{0} {1}{3:.{2}f}'.format(el, ' '*(self.max_label_len[ind] - len(el)), prec, self.cl[self.sl[ind]].freqs[self.var][el]) |
|
50
|
|
|
while i < self.max_nb_rows - 1: |
|
51
|
|
|
yield ' ' * (self.max_label_len[ind] + prec + 3) |
|
52
|
|
|
i += 1 |
|
53
|
|
|
|
|
54
|
|
|
def _get_generator(self, ind, prec): |
|
55
|
|
|
return (_ for _ in self._gen_entries(ind, prec)) |
|
|
|
|
|
|
56
|
|
|
|