1
|
|
|
""" |
2
|
|
|
Module used to process information regarding the crowdsourcing platform. |
3
|
|
|
""" |
4
|
1 |
|
class Found(Exception):
    """Control-flow marker exception.

    It adds nothing to ``Exception`` (no message, no payload); it exists
    only so that it can be raised and caught by name.
    """
7
|
|
|
|
8
|
1 |
|
def get_platform(dframe):
    """Identify the crowdsourcing platform a results table originates from.

    The platform is recognised solely from the name of the first column.

    Args:
        dframe: table of judgments exposing ``columns.values``
            (a pandas DataFrame).

    Returns:
        A dict mapping the platform-specific column names to the generic
        names ``judgment``, ``unit``, ``worker``, ``started`` and
        ``submitted``; or ``False`` when the first column is not recognised
        or the table has no columns at all.
    """
    column_names = dframe.columns.values
    if len(column_names) == 0:
        # Nothing to inspect: report "unknown platform" instead of letting
        # the index lookup below raise IndexError.
        return False

    first_column = column_names[0]

    if first_column == '_unit_id':
        # CrowdFlower
        return {
            '_id': 'judgment',
            '_unit_id': 'unit',
            '_worker_id': 'worker',
            '_started_at': 'started',
            '_created_at': 'submitted',
        }

    if first_column == 'HITId':
        # Mturk
        return {
            'AssignmentId': 'judgment',
            'HITId': 'unit',
            'WorkerId': 'worker',
            'AcceptTime': 'started',
            'SubmitTime': 'submitted',
        }

    return False
32
|
|
|
|
33
|
1 |
|
def _select_amt_columns(column_names, configured, amt_prefix, target_prefix):
    """Map selected AMT columns to their prefixed internal names."""
    if configured:
        # if config is specified, use those columns
        selected = [c for c in column_names if c in configured]
    else:
        # otherwise fall back to the AMT naming convention
        selected = [c for c in column_names if c.startswith(amt_prefix)]

    mapping = {}
    for name in selected:
        # Strip only a *leading* AMT prefix. A plain str.replace would also
        # delete the marker wherever else it occurs inside the column name.
        bare = name[len(amt_prefix):] if name.startswith(amt_prefix) else name
        mapping[name] = target_prefix + bare
    return mapping


def configure_amt_columns(dframe, config):
    """Configure AMT input and output columns.

    Input columns are those listed in ``config.inputColumns`` or, when that
    is empty, every column starting with ``Input.``. Output columns are those
    listed in ``config.outputColumns`` or, when that is empty, every column
    starting with ``Answer.``. Only columns present in ``dframe`` are kept.

    Args:
        dframe: table of judgments exposing ``columns.values``.
        config: object with ``inputColumns`` and ``outputColumns``
            attributes; its ``input`` and ``output`` attributes are
            overwritten.

    Returns:
        Tuple ``(config.input, config.output)``; each is a dict mapping the
        original column name to ``'input.<name>'`` / ``'output.<name>'``
        with the leading ``Input.`` / ``Answer.`` prefix removed.
    """
    column_names = dframe.columns.values
    config.input = {}
    config.output = {}

    config.input = _select_amt_columns(
        column_names, config.inputColumns, 'Input.', 'input.')
    config.output = _select_amt_columns(
        column_names, config.outputColumns, 'Answer.', 'output.')
    return config.input, config.output
53
|
|
|
|
54
|
1 |
|
def configure_platform_columns(dframe, config):
    """Configure input and output columns for FigureEight and custom platforms.

    Only columns explicitly listed in ``config.inputColumns`` /
    ``config.outputColumns`` and present in ``dframe`` are selected; when a
    list is empty the corresponding mapping stays empty.

    Args:
        dframe: table of judgments exposing ``columns.values``.
        config: object with ``inputColumns`` and ``outputColumns``
            attributes; its ``input`` and ``output`` attributes are
            overwritten.

    Returns:
        Tuple ``(config.input, config.output)``; each is a dict mapping the
        original column name to ``'input.<name>'`` / ``'output.<name>'``.
    """
    config.input, config.output = {}, {}

    if config.inputColumns:
        chosen_inputs = {}
        for name in dframe.columns.values:
            if name in config.inputColumns:
                chosen_inputs[name] = 'input.' + name
        config.input = chosen_inputs

    if config.outputColumns:
        chosen_outputs = {}
        for name in dframe.columns.values:
            if name in config.outputColumns:
                chosen_outputs[name] = 'output.' + name
        config.output = chosen_outputs

    return config.input, config.output
66
|
|
|
|
67
|
1 |
|
def configure_with_missing_columns(dframe, config):
    """Classify columns as input or output from how their values vary.

    Rows are grouped by ``_unit_id``. A candidate column whose values are
    constant (or entirely missing) within every unit is taken to be input
    content; a column that shows more than one distinct value inside any
    unit is taken to be an output judgment. This is a heuristic, not a
    guarantee.

    Columns named ``clustering``, columns starting with ``_`` or ``e_``, and
    columns ending with ``_gold``, ``_reason`` or ``browser`` are never
    classified.

    Args:
        dframe: table of judgments containing a ``_unit_id`` column.
        config: object with ``inputColumns``, ``outputColumns`` and the dict
            attributes ``input`` and ``output``. Detected input columns are
            added only when ``config.inputColumns`` is empty, detected output
            columns only when ``config.outputColumns`` is empty.

    Returns:
        The same ``config`` object, updated in place.
    """
    units = dframe.groupby('_unit_id')
    candidates = [
        c for c in dframe.columns.values
        if c != 'clustering'
        and not c.startswith(('_', 'e_'))
        and not c.endswith(('_gold', '_reason', 'browser'))
    ]

    for colname in candidates:
        # One grouped nunique() per column replaces the per-unit Python loop
        # and the raise/except used to break out of it. A count of 0 (all
        # values missing) or 1 means the column is constant within the unit.
        varies_within_unit = bool((units[colname].nunique() > 1).any())

        if varies_within_unit:
            if not config.outputColumns:
                config.output[colname] = 'output.' + colname
        elif not config.inputColumns:
            config.input[colname] = 'input.' + colname

    return config
87
|
|
|
|
88
|
1 |
|
def get_column_types(dframe, config):
    """Determine which columns hold input content and which hold judgments.

    The platform is recognised from the name of the first column, and
    ``config.input`` / ``config.output`` are filled with dicts of the form
    ``{original column name: safe-string column name}``.

    Args:
        dframe: table of judgments exposing ``columns.values``.
        config: object with ``inputColumns`` and ``outputColumns``
            attributes; its ``input`` and ``output`` attributes are
            overwritten.

    Returns:
        The updated ``config``. For an unrecognised platform, ``None`` is
        returned unless both ``config.inputColumns`` and
        ``config.outputColumns`` are set.
    """
    # start from empty mappings of input content and output judgments
    config.input = {}
    config.output = {}

    first_column = dframe.columns.values[0]

    if first_column == 'HITId':
        # Mturk: configured columns are used when given, otherwise the
        # 'Input.' / 'Answer.' naming convention decides
        config.input, config.output = configure_amt_columns(dframe, config)
        return config

    # CrowdFlower ('_unit_id') and unknown platforms both start from the
    # columns named in the config
    config.input, config.output = configure_platform_columns(dframe, config)

    # a config covering both input and output columns needs no detection
    if config.inputColumns and config.outputColumns:
        return config

    if first_column == '_unit_id':
        # Try to identify the input and output columns from the data: a
        # column whose values are identical within each unit is treated as
        # input. This is not failsafe but should give decent results without
        # settings; it is best to make a settings.py file for a collection.
        return configure_with_missing_columns(dframe, config)

    # unknown platform type without a complete config: nothing is returned
    return None
126
|
|
|
|