ResultsHandler.get_all_columns()   A
last analyzed

Complexity

Conditions 3

Size

Total Lines 4
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 4
nop 2
dl 0
loc 4
rs 10
c 0
b 0
f 0
1
import os
2
import re
3
import warnings
4
from glob import glob
5
from functools import reduce
6
7
import attr
8
9
from topic_modeling_toolkit.results import ExperimentalResults
10
11
from .fitness import FitnessFunction
12
13
14
KERNEL_SUB_ENTITIES = ('coherence', 'contrast', 'purity', 'size')
15
16
MAX_DECIMALS = 2  # this consant should agree with the patm.modeling.experiment.Experiment.MAX_DECIMALS
17
18
def _get_kernel_sub_hash(entity):
19
    assert entity in KERNEL_SUB_ENTITIES
20
    return {'kernel-'+entity: {'scalar-extractor': lambda x,y: getattr(getattr(x.tracked, 'kernel'+y[2:]).average, entity).last if hasattr(x.tracked, 'kernel'+y[2:]) else None,
21
                               'list-extractor': lambda x, y: getattr(getattr(x.tracked, 'kernel' + y[2:]).average, entity).all if hasattr(x.tracked, 'kernel' + y[2:]) else None,
22
                               'column-title': lambda x: 'k'+entity[:3:2]+'.'+str(x)[2:],
23
                               'to-string': '{:.4f}',
24
                               'definitions': lambda x: ['kernel-{}-{}'.format(entity, y) for y in x.tracked.kernel_thresholds]}}
25
26
# Registry of reportable quantities. Each entry may provide:
#   'scalar-extractor': callable returning the last tracked value
#   'list-extractor':   callable returning the full tracked series (where available)
#   'column-title':     callable producing the short column header
#   'to-string':        format spec used when rendering values (defaults to '{}')
#   'definitions':      callable expanding a dynamic column into concrete definitions
COLUMNS_HASH = {
    'nb-topics': {
        'scalar-extractor': lambda res: res.scalars.nb_topics,
        'column-title': lambda: 'tpcs',
    },
    'collection-passes': {
        'scalar-extractor': lambda res: res.scalars.dataset_iterations,
        'column-title': lambda: 'col-i',
    },
    'document-passes': {
        'scalar-extractor': lambda res: res.scalars.document_passes,
        'column-title': lambda: 'doc-i',
    },
    'total-phi-updates': {
        'scalar-extractor': lambda res: res.scalars.dataset_iterations * res.scalars.document_passes,
        'column-title': lambda: 'phi-u',
    },
    'perplexity': {
        'scalar-extractor': lambda res: res.tracked.perplexity.last,
        'list-extractor': lambda res: res.tracked.perplexity.all,
        'column-title': lambda: 'prpl',
        'to-string': '{:.1f}',
    },
    'top-tokens-coherence': {
        'scalar-extractor': lambda res, card: getattr(res.tracked, 'top' + str(card)).average_coherence.last if hasattr(res.tracked, 'top' + str(card)) else None,
        'list-extractor': lambda res, card: getattr(res.tracked, 'top' + str(card)).average_coherence.all if hasattr(res.tracked, 'top' + str(card)) else None,
        'column-title': lambda card: 'top' + str(card) + 'ch',
        'to-string': '{:.4f}',
        'definitions': lambda res: ['top-tokens-coherence-' + str(card) for card in res.tracked.top_tokens_cardinalities],
    },
    'sparsity-phi': {
        'scalar-extractor': lambda res, mod: getattr(res.tracked, 'sparsity_phi_' + mod).last if hasattr(res.tracked, 'sparsity_phi_' + mod) else None,
        'list-extractor': lambda res, mod: getattr(res.tracked, 'sparsity_phi_' + mod).all if hasattr(res.tracked, 'sparsity_phi_' + mod) else None,
        'column-title': lambda mod: 'spp@' + mod,
        'to-string': '{:.2f}',
        'definitions': lambda res: ['sparsity-phi-{}'.format(mod) for mod in res.tracked.modalities_initials],
    },
    'sparsity-theta': {
        'scalar-extractor': lambda res: res.tracked.sparsity_theta.last,
        'list-extractor': lambda res: res.tracked.sparsity_theta.all,
        'column-title': lambda: 'spt',
        'to-string': '{:.2f}',
    },
    'background-tokens-ratio': {
        # threshold comes in as e.g. '0.80'; the tracked attribute keeps only the digits after the dot
        'scalar-extractor': lambda res, thr: getattr(res.tracked, 'background_tokens_ratio_' + str(thr)[2:]).last if hasattr(res.tracked, 'background_tokens_ratio_' + str(thr)[2:]) else None,
        'list-extractor': lambda res, thr: getattr(res.tracked, 'background_tokens_ratio_' + str(thr)[2:]).all if hasattr(res.tracked, 'background_tokens_ratio_' + str(thr)[2:]) else None,
        'column-title': lambda thr: 'btr.' + str(thr)[2:],
        'to-string': '{:.2f}',
        'definitions': lambda res: ['background-tokens-ratio-{}'.format(thr[:4]) for thr in res.tracked.background_tokens_thresholds],
    },
    'regularizers': {
        'scalar-extractor': lambda res: '[{}]'.format(', '.join(map(regularizers_format, res.regularizers))),
        'column-title': lambda: 'regs',
    },
    # Takes precedence over the entry generated by _get_kernel_sub_hash('size') when the
    # hashes are merged below, so kernel size renders with a single decimal.
    'kernel-size': {
        'scalar-extractor': lambda res, thr: getattr(res.tracked, 'kernel' + thr[2:]).average.size.last if hasattr(res.tracked, 'kernel' + thr[2:]) else None,
        'list-extractor': lambda res, thr: getattr(res.tracked, 'kernel' + thr[2:]).average.size.all if hasattr(res.tracked, 'kernel' + thr[2:]) else None,
        'column-title': lambda thr: 'k' + 'size'[:3:2] + '.' + str(thr)[2:],
        'to-string': '{:.1f}',
        'definitions': lambda res: ['kernel-size-{}'.format(thr) for thr in res.tracked.kernel_thresholds],
    },
}
67
68
def regularizers_format(reg_def_string):
    """Render a regularizer definition string for tabular display.

    Currently a deliberate pass-through: an earlier version abbreviated each
    dash-separated token before the '|' separator, but that logic is disabled.

    :param str reg_def_string: a regularizer definition string
    :rtype: str
    """
    return reg_def_string
75
76
# Fold the generated per-entity kernel hashes into the registry. Keys already present
# in COLUMNS_HASH (notably 'kernel-size') take precedence over the generated entries.
for _kernel_hash in (_get_kernel_sub_hash(_entity) for _entity in KERNEL_SUB_ENTITIES):
    COLUMNS_HASH = dict(_kernel_hash, **COLUMNS_HASH)
77
78
79
class ResultsHandler(object):
    """Discovers, caches, sorts and queries ExperimentalResults stored as json files.

    Result files are expected under <collections_root>/<collection_name>/<results_dir_name>/*.json.
    """

    # Maps type(selection) -> callable receiving [results_list, selection] and returning a subset.
    _list_selector_hash = {str: lambda x: x[0] if x[1] == 'all' else None,
                           range: lambda x: [x[0][_] for _ in x[1]],
                           int: lambda x: x[0][:x[1]],
                           list: lambda x: [x[0][_] for _ in x[1]]}
    # Maps the requested quantity to the prefix of the extractor key in COLUMNS_HASH.
    _QUANTITY_2_EXTRACTOR_KEY = {'last': 'scalar', 'all': 'list'}
    # Columns that expand into multiple concrete definitions (one per threshold/cardinality/modality).
    DYNAMIC_COLUMNS = ['kernel-size', 'kernel-coherence', 'kernel-contrast', 'kernel-purity', 'top-tokens-coherence', 'sparsity-phi', 'background-tokens-ratio']
    DEFAULT_COLUMNS = ['nb-topics', 'collection-passes', 'document-passes', 'total-phi-updates', 'perplexity'] +\
                      DYNAMIC_COLUMNS[:-1] + ['sparsity-theta'] + [DYNAMIC_COLUMNS[-1]] + ['regularizers']

    def __init__(self, collections_root_path, results_dir_name='results'):
        """
        :param str collections_root_path: directory holding one sub-directory per collection
        :param str results_dir_name: per-collection sub-directory holding the result json files
        """
        self._collections_root = collections_root_path
        self._results_dir_name = results_dir_name
        self._results_hash = {}  # json path -> ExperimentalResults (lazy cache)
        self._fitness_function_hash = {}  # metric name -> FitnessFunction (lazy cache)
        self._list_selector = None

    def get_experimental_results(self, collection_name, sort='', selection='all'):
        """
        Call this method to get a list of experimental result objects from topic models trained on the given collection.\n
        :param str collection_name:
        :param str sort: metric to sort by; if empty the experimental results are obtained alphabetically on their json path
        :param str or range or int or list selection: whether to select a subset of the experimental results from the given collection\n
            - if selection == 'all', returns every experimental results object "extracted" from the jsons
            - if type(selection) == range, returns a "slice" of the experimental results based on the range
            - if type(selection) == int, returns the first n experimental results
            - if type(selection) == list of int, indices to sample the experimental results with
            - if type(selection) == list of str, model labels whose results to return
        :return: the ExperimentalResults objects
        :rtype: list
        """
        result_paths = glob('{}/*.json'.format(os.path.join(self._collections_root, collection_name, self._results_dir_name)))
        if type(selection) == list and all(type(x) == str for x in selection):  # the input list contains model labels
            e = self._get_experimental_results([_ for _ in result_paths if re.search(r'/(?:{})\.json'.format('|'.join(selection)), _)])
            if len(e) != len(selection):
                raise ValueError("len1 = {}, len2 = {}\nseq1 = {}\nseq2 = {}".format(len(e), len(selection), [_.scalars.model_label for _ in e], selection))
            return e
        self._list_selector = lambda y: ResultsHandler._list_selector_hash[type(selection)]([y, selection])
        r = self._get_experimental_results(result_paths, metric_sorter=self._get_metric(sort))
        assert len(result_paths) == len(r)
        return self._list_selector(r)

    def _get_experimental_results(self, results_paths, metric_sorter=None):
        """Load (cached) results for the given json paths, sorted by *metric_sorter* or by path."""
        if metric_sorter is None:
            return [self._process_result_path(_) for _ in sorted(results_paths)]
        return metric_sorter([self._process_result_path(x) for x in results_paths])

    def _process_result_path(self, result_path):
        """Load the ExperimentalResults for a json path, memoizing per path."""
        if result_path not in self._results_hash:
            self._results_hash[result_path] = ExperimentalResults.create_from_json_file(result_path)
        return self._results_hash[result_path]

    def _get_metric(self, metric):
        """Return a MetricSorter for the given metric name, or None for a falsy metric."""
        if not metric:
            return None
        if metric not in self._fitness_function_hash:
            self._fitness_function_hash[metric] = FitnessFunction.single_metric(metric)
        return MetricSorter(metric, lambda x: self._fitness_function_hash[metric].compute([ResultsHandler.extract(x, metric, 'last')]))

    @staticmethod
    def get_titles(column_definitions):
        """Return the short column titles for the given column definitions."""
        return [ResultsHandler.get_abbreviation(x) for x in column_definitions]

    @staticmethod
    def get_abbreviation(definition):
        """Return the short column title for a single definition (e.g. 'kernel-coherence-0.80' -> 'kch.80')."""
        tokens, parameters = ResultsHandler._parse_column_definition(definition)
        return COLUMNS_HASH['-'.join(tokens)]['column-title'](*parameters)

    @staticmethod
    def stringnify(column, value):
        """
        :param str column: key or definition; example values: 'perplexity', 'kernel-coherence', 'kernel-coherence-0.80'
        :param value:
        :return: the value rendered with the column's format spec ('{}' when none is declared)
        :rtype: str
        """
        return COLUMNS_HASH.get(column, COLUMNS_HASH[ResultsHandler._get_hash_key(column)]).get('to-string', '{}').format(value)

    @staticmethod
    def extract(exp_results, column_definition, quantity):
        """
        Call this method to query the given experimental results object about a specific metric. Supports requesting all
        values tracked along the training process.
        :param results.experimental_results.ExperimentalResults exp_results:
        :param str column_definition:
        :param str quantity: must be one of {'last', 'all'}
        :return: the metric's last value or its full tracked series
        """
        tokens, parameters = ResultsHandler._parse_column_definition(column_definition)
        return COLUMNS_HASH['-'.join(tokens)][ResultsHandler._QUANTITY_2_EXTRACTOR_KEY[quantity] + '-extractor'](*list([exp_results] + parameters))

    @staticmethod
    def get_all_columns(exp_results, requested_entities):
        """Expand the requested entities into concrete column definitions (dynamic ones per exp_results)."""
        return reduce(lambda i, j: i + j,
                      [COLUMNS_HASH[x]['definitions'](exp_results) if x in ResultsHandler.DYNAMIC_COLUMNS else [x] for x in requested_entities])

    ###### UTILITY FUNCTIONS ######
    @staticmethod
    def _parse_column_definition(definition):
        """Split a dash-separated definition into [hash-key tokens, parameter elements]."""
        return [list([_f for _f in y if _f]) for y in zip(*[(x, None) if ResultsHandler._is_token(x) else (None, x) for x in definition.split('-')])]

    @staticmethod
    def _get_hash_key(column_definition):
        """Keep only the token elements of a definition, forming the COLUMNS_HASH key."""
        return '-'.join([_ for _ in column_definition.split('-') if ResultsHandler._is_token(_)])

    @staticmethod
    def _is_token(definition_element):
        """Return True when the element is part of the hash key (not a numeric/modality parameter)."""
        if not definition_element:  # guard: '' would crash on the [0] access below
            return False
        try:
            _ = float(definition_element)
            return False
        except ValueError:
            if definition_element[0] == '@' or len(definition_element) == 1:
                return False
            return True

    @staticmethod
    def _label_selection(labels, experimental_results_list):
        """Returns the indices of the input labels based on the input experimental results list"""
        model_labels = [x.scalars.model_label for x in experimental_results_list]
        # fix: the index into the results list IS model_labels.index(l); the previous code
        # searched for that integer inside the list of result objects and raised ValueError
        return [model_labels.index(l) for l in labels if l in model_labels]
219
220
221
##########################
222
def _build_sorter(instance, attribute, value):
223
    if not hasattr(value, '__call__'):
224
        raise TypeError("Second constructor argument should be a callable object, in case the first is 'alphabetical'")
225
    # this is sorting from 'bigger' to 'smaller' (independently of how <,>, operators have been defined)
226
    instance.experimental_result_sorter = lambda exp_res_objs_list: sorted(exp_res_objs_list, key=value, reverse=True)
227
228
@attr.s(cmp=True, repr=True, str=True)
class MetricSorter(object):
    """Named wrapper around a key-callable that sorts experimental results best-first."""
    name = attr.ib(init=True, converter=str, cmp=True, repr=True)
    # the validator (_build_sorter) swaps the raw key-callable for a descending list sorter
    experimental_result_sorter = attr.ib(init=True, validator=_build_sorter, cmp=True, repr=True)

    @classmethod
    def from_function(cls, function):
        """Build a sorter named after the key-function's __name__.

        Fix: the previous body returned the undefined name `ResultsSorter`; it now
        uses `cls` so the alternate constructor actually works.
        """
        return cls(function.__name__, function)

    def __call__(self, *args, **kwargs):
        """Sort the list passed as the first positional argument (descending on the metric)."""
        return self.experimental_result_sorter(args[0])
238
############################
239
240
241
if __name__ == '__main__':
242
    ms = ModelSelector(collection_results_dir_path='/data/thesis/data/collections/dd/results')
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable ModelSelector does not seem to be defined.
Loading history...
243