GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.

experiment_replication_variable_workers   A
last analyzed

Complexity

Total Complexity 24

Size/Duplication

Total Lines 116
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 24
eloc 80
dl 0
loc 116
rs 10
c 0
b 0
f 0

10 Functions

Rating   Name   Duplication   Size   Complexity  
A get_uniq_unit_ids() 0 4 1
B create_analysis_files() 0 32 7
A main() 0 11 3
A gen_all_worker_combinations() 0 10 3
A get_no_work_unit_id() 0 4 1
A get_all_unit_combinations() 0 5 1
A pick_random_worker_set() 0 3 1
A gen_all_k_combinations() 0 8 3
A count_bits() 0 9 3
A my_product() 0 4 1
1
#!/usr/bin/env python2
2
# -*- coding: utf-8 -*-
3
"""
4
Created on Mon Jul  2 14:40:44 2018
5
"""
6
import os
7
import random
8
import sys
9
10
import itertools as it
11
import pandas as pd
12
13
def get_uniq_unit_ids(dframe, unit_id_field):
    """Return the distinct unit ids appearing in the `unit_id_field` column."""
    return dframe[unit_id_field].unique()
def get_no_work_unit_id(dframe, unit_id, unit_id_field):
    """Return (row count, rows) for the annotations belonging to one unit."""
    unit_rows = dframe[dframe[unit_id_field] == unit_id]
    return (len(unit_rows), unit_rows)
def count_bits(number, n_bits):
    """Return (popcount, positions): how many of the low `n_bits` bits of
    `number` are set, together with their positions in ascending order."""
    set_positions = [pos for pos in range(n_bits) if (number >> pos) & 1]
    return (len(set_positions), set_positions)
def gen_all_k_combinations(k, num_size):
    """Return every k-element subset of range(num_size), each as a sorted list.

    The previous implementation enumerated all 2**num_size bitmasks and kept
    the ones with exactly k set bits — exponential in num_size even when the
    number of k-subsets is small (num_size here is the per-unit annotation
    count, so this blow-up is real).  itertools.combinations yields the same
    subsets directly in O(C(num_size, k)) work; the subsets are now emitted
    in lexicographic order, which downstream callers do not depend on (they
    only sample a random element).

    k        -- subset size; for k < 1 the original returned [], preserved here
    num_size -- size of the index universe
    """
    if k < 1:
        # Match the original bitmask loop, which started at 1 and therefore
        # never produced the empty subset.
        return []
    return [list(comb) for comb in it.combinations(range(num_size), k)]
def gen_all_worker_combinations(subset_size, count, subset_unit_id, worker_id_field):
    """Translate every size-`subset_size` index combination over `count`
    annotation rows into the matching list of worker ids."""
    worker_column = subset_unit_id[worker_id_field]
    return [[worker_column.iloc[position] for position in index_comb]
            for index_comb in gen_all_k_combinations(subset_size, count)]
def get_all_unit_combinations(unit_dict):
    """Print the Cartesian product of worker combinations across all units,
    taking the units in sorted-key order.  Returns None (print-only helper)."""
    ordered_units = sorted(unit_dict)
    product_iter = it.product(*(unit_dict[uid] for uid in ordered_units))
    print(list(product_iter))
def my_product(dicts):
    """Expand a {unit: [worker-combinations]} dict into the list of every
    possible {unit: chosen-combination} assignment."""
    keys, value_lists = zip(*dicts.items())
    all_choices = it.product(*value_lists)
    return [dict(zip(keys, choice)) for choice in all_choices]
def pick_random_worker_set(worker_sets):
    """Return one uniformly random element of `worker_sets`
    (raises on an empty sequence, like random.choice)."""
    chosen_index = random.randrange(len(worker_sets))
    return worker_sets[chosen_index]
def create_analysis_files(dataset_file, max_no_workers, max_runs, \
                          storing_folder, unit_id_field, worker_id_field):
    """Create files of various number of workers.

    For every worker-subset size from 3 up to max_no_workers (inclusive),
    builds a "<storing_folder><size>workers" directory and writes one CSV
    per run, each holding a randomly sampled worker subset per unit.

    dataset_file    -- path of the input CSV read via pandas
    max_no_workers  -- largest subset size; used as a range() bound, so it
                      must already be an int (command-line callers must
                      convert the string first)
    max_runs        -- run-count bound; note range(0, max_runs + 1) emits
                      max_runs + 1 files — TODO confirm the extra run is
                      intended
    storing_folder  -- output-directory prefix; plain string concatenation,
                      so it should normally end with a path separator
    unit_id_field   -- name of the unit-id column
    worker_id_field -- name of the worker-id column
    """
    dataset = pd.read_csv(dataset_file)
    unique_unit_ids = get_uniq_unit_ids(dataset, unit_id_field)

    for subset_size in range(3, max_no_workers + 1):
        # One output directory per subset size, e.g. "<folder>5workers".
        workers_directory = storing_folder + str(subset_size) + "workers"
        if not os.path.exists(workers_directory):
            os.makedirs(workers_directory)

        # Precompute, per unit, every size-`subset_size` worker combination
        # so the per-run loop below only has to sample from them.
        map_unit_id_combinations = {}
        for unit_id in range(0, len(unique_unit_ids)):
            (count, subset_unit_id) = get_no_work_unit_id(dataset, unique_unit_ids[unit_id], \
                                                          unit_id_field)
            combinations = gen_all_worker_combinations(subset_size, count, subset_unit_id, \
                                                       worker_id_field)
            map_unit_id_combinations[unique_unit_ids[unit_id]] = combinations

        for run_no in range(0, max_runs + 1):
            # Draw one random worker combination per unit for this run.
            unit_worker_set = {}
            for unit_id, worker_sets in map_unit_id_combinations.items():
                unit_worker_set[unit_id] = pick_random_worker_set(worker_sets)

            # Collect the annotation rows matching each unit's sampled
            # workers and concatenate them into a single run file.
            df_subset_size = pd.DataFrame()
            for unit_id, worker_set in unit_worker_set.items():
                df_subset = dataset[(dataset[unit_id_field] == unit_id) &
                                    (dataset[worker_id_field].isin(worker_set))]
                frames = [df_subset_size, df_subset]
                df_subset_size = pd.concat(frames)

            df_subset_size.to_csv(workers_directory + "/run_" + str(run_no) + ".csv", index=False)
def main(argv=None):
    """Command-line entry point.

    Expects six user arguments after the script name:
    dataset_filename max_no_workers max_runs storing_folder unit_id_field
    worker_id_field.  Prints a usage message when too few are given.

    argv -- full argument vector including the program name at index 0;
            defaults to sys.argv
    """
    if argv is None:
        argv = sys.argv

    # argv[0] is the program name, so six user arguments mean len(argv) == 7.
    # The previous check (len(argv) < 6) accepted one argument too few and
    # then passed the script name itself as the dataset file.
    if len(argv) < 7:
        print('Usage: python replication_experiment_wrt_workers.py dataset_filename'
              ' max_no_workers max_runs storing_folder unit_id_field worker_id_field')

    else:
        # max_no_workers and max_runs feed range() bounds downstream, so the
        # command-line strings must be converted to integers here.
        create_analysis_files(argv[1], int(argv[2]), int(argv[3]),
                              argv[4], argv[5], argv[6])
# Run as a script: delegate to main(), which parses sys.argv itself.
if __name__ == '__main__':
    main()
116