|
1
|
|
|
""" |
|
2
|
|
|
Module used to process information regarding the crowdsourcing platform. |
|
3
|
|
|
""" |
|
4
|
1 |
|
class Found(Exception):
    """Plain ``Exception`` subclass that adds no attributes or behaviour.

    It exists only so that it can be raised and caught by name.
    """
|
7
|
|
|
|
|
8
|
1 |
|
def get_platform(dframe):
    """Identify the crowdsourcing platform a results table comes from.

    The decision is based solely on the name of the first column of
    ``dframe``.

    Args:
        dframe: table exposing ``columns.values``; it must have at least
            one column, because the first column name is read by index.

    Returns:
        A dict mapping the platform's own column names to the standard
        names ``judgment``, ``unit``, ``worker``, ``started`` and
        ``submitted``, or ``False`` when the first column matches no known
        platform.
    """
    # The table is rebuilt on every call so that each caller receives its
    # own dict and may modify it freely.
    column_maps = {
        # CrowdFlower
        '_unit_id': {
            '_id': 'judgment',
            '_unit_id': 'unit',
            '_worker_id': 'worker',
            '_started_at': 'started',
            '_created_at': 'submitted',
        },
        # Mturk
        'HITId': {
            'AssignmentId': 'judgment',
            'HITId': 'unit',
            'WorkerId': 'worker',
            'AcceptTime': 'started',
            'SubmitTime': 'submitted',
        },
    }
    leading_column = dframe.columns.values[0]
    return column_maps.get(leading_column, False)
|
32
|
|
|
|
|
33
|
1 |
|
def configure_amt_columns(dframe, config):
    """Build the input and output column mappings for an AMT results table.

    Input columns are the columns of ``dframe`` listed in
    ``config.inputColumns`` when that setting is non-empty, otherwise every
    column whose name starts with ``Input.``. Output columns are chosen the
    same way from ``config.outputColumns`` or the ``Answer.`` prefix.

    Each selected column is mapped to ``input.<name>`` / ``output.<name>``,
    where ``<name>`` is the original name with every ``Input.`` (respectively
    ``Answer.``) substring removed.

    Args:
        dframe: table exposing ``columns.values``.
        config: settings object providing ``inputColumns`` and
            ``outputColumns``; its ``input`` and ``output`` attributes are
            overwritten.

    Returns:
        The tuple ``(config.input, config.output)``.
    """
    config.input = {}
    config.output = {}
    all_columns = dframe.columns.values

    def renamed(names, prefix, target):
        """Map each name to ``target`` plus the name stripped of ``prefix``."""
        return {name: target + name.replace(prefix, '') for name in names}

    if config.inputColumns:
        input_names = [name for name in all_columns if name in config.inputColumns]
    else:
        input_names = [name for name in all_columns if name.startswith('Input.')]
    config.input = renamed(input_names, 'Input.', 'input.')

    # if config is specified, use those columns
    if config.outputColumns:
        output_names = [name for name in all_columns if name in config.outputColumns]
    else:
        output_names = [name for name in all_columns if name.startswith('Answer.')]
    config.output = renamed(output_names, 'Answer.', 'output.')

    return config.input, config.output
|
53
|
|
|
|
|
54
|
1 |
|
def configure_platform_columns(dframe, config):
    """Configure input and output columns for FigureEight and custom platforms.

    Only explicitly configured columns are used: every column of ``dframe``
    that is listed in ``config.inputColumns`` is mapped to ``input.<name>``
    and every column listed in ``config.outputColumns`` is mapped to
    ``output.<name>``. A setting that is empty or unset yields an empty
    mapping.

    Args:
        dframe: table exposing ``columns.values``.
        config: settings object providing ``inputColumns`` and
            ``outputColumns``; its ``input`` and ``output`` attributes are
            overwritten.

    Returns:
        The tuple ``(config.input, config.output)``.
    """
    config.input = {}
    config.output = {}
    available = dframe.columns.values

    requested_inputs = config.inputColumns
    if requested_inputs:
        chosen_inputs = {}
        for name in available:
            if name in requested_inputs:
                chosen_inputs[name] = 'input.' + name
        config.input = chosen_inputs

    requested_outputs = config.outputColumns
    if requested_outputs:
        chosen_outputs = {}
        for name in available:
            if name in requested_outputs:
                chosen_outputs[name] = 'output.' + name
        config.output = chosen_outputs

    return config.input, config.output
|
66
|
|
|
|
|
67
|
1 |
|
def configure_with_missing_columns(dframe, config):
    """Guess which columns are input and which are output from their values.

    Candidate columns are those not named ``clustering`` that neither start
    with ``_`` or ``e_`` nor end with ``_gold``, ``_reason`` or ``browser``.
    Rows are grouped by ``_unit_id``. A candidate whose values never differ
    inside any single unit is treated as input; a candidate with more than
    one distinct value inside at least one unit is treated as output.

    Args:
        dframe: pandas DataFrame containing a ``_unit_id`` column.
        config: settings object whose ``input`` and ``output`` attributes
            are dicts. ``config.input`` is only filled in when
            ``config.inputColumns`` is empty, and ``config.output`` only
            when ``config.outputColumns`` is empty, so explicit settings
            are never overridden.

    Returns:
        The same ``config`` object, updated in place.
    """
    units = dframe.groupby('_unit_id')
    candidates = [
        name for name in dframe.columns.values
        if name != 'clustering'
        and not name.startswith(('_', 'e_'))
        and not name.endswith(('_gold', '_reason', 'browser'))
    ]

    for colname in candidates:
        # any() stops at the first unit with differing values, so no
        # exception is needed to break out of the scan over the units.
        varies_within_unit = any(
            unit[colname].nunique() > 1 for _, unit in units
        )
        if varies_within_unit:
            if not config.outputColumns:
                config.output[colname] = 'output.' + colname
        elif not config.inputColumns:
            config.input[colname] = 'input.' + colname

    return config
|
87
|
|
|
|
|
88
|
1 |
|
def get_column_types(dframe, config):
    """Fill ``config.input`` and ``config.output`` for the given table.

    Both attributes become dicts in which each entry matches
    [original column name]:[safestring column name]. The platform is
    recognised from the name of the first column of ``dframe``:

    * ``HITId`` (Mturk): the mappings come from ``configure_amt_columns``.
    * ``_unit_id``: the configured columns come from
      ``configure_platform_columns``; unless both ``config.inputColumns``
      and ``config.outputColumns`` are set, the result of
      ``configure_with_missing_columns`` is returned so that missing
      columns are guessed from the data.
    * anything else (unknown platform): only the configured columns from
      ``configure_platform_columns`` are used.

    Args:
        dframe: table exposing ``columns.values`` with at least one column.
        config: settings object providing ``inputColumns`` and
            ``outputColumns``; its ``input`` and ``output`` attributes are
            overwritten.

    Returns:
        The ``config`` object on every path. For an unknown platform without
        both column settings the mappings may be partial or empty.
    """
    config.input = {}
    config.output = {}

    leading_column = dframe.columns.values[0]

    if leading_column == 'HITId':
        # Mturk
        config.input, config.output = configure_amt_columns(dframe, config)
        return config

    # Every other platform starts from the explicitly configured columns.
    config.input, config.output = configure_platform_columns(dframe, config)

    fully_configured = config.inputColumns and config.outputColumns
    if leading_column == '_unit_id' and not fully_configured:
        # Try to identify the remaining input and output columns: a column
        # whose values are identical within every unit counts as input.
        # This is not failsafe but should give decent results without
        # settings; it is best to make a settings.py file for a collection.
        return configure_with_missing_columns(dframe, config)

    # Return config explicitly here: an unknown platform without both column
    # settings would otherwise fall off the end and hand callers None.
    return config
|
126
|
|
|
|