Completed: push to master ( d75a64...aeff40 ) by Oana, created 22:51

crowdtruth.load.getPlatform()   Rating: A

Complexity
    Conditions      3

Size
    Total Lines     26
    Code Lines      16

Duplication
    Lines           0
    Ratio           0 %

Code Coverage
    Tests           3
    CRAP Score      4.125

Importance
    Changes         0

Metric    Value
eloc      16       (executable lines of code)
dl        0        (duplicated lines)
loc       26       (total lines)
ccs       3        (covered statements)
cts       6        (total statements)
cp        0.5      (coverage proportion: ccs / cts)
rs        9.6
c         0
b         0
f         0
cc        3        (cyclomatic complexity)
nop       1        (number of parameters)
crap      4.125    (CRAP score; see below)
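
The reported CRAP Score is consistent with the standard CRAP formula, crap = cc^2 * (1 - cp)^3 + cc, which combines cyclomatic complexity (cc = 3) with coverage expressed as a fraction (cp = 0.5). A minimal sketch to reproduce the value (the function name is ours, not part of the report):

def crap_score(complexity, coverage):
    # crap = complexity^2 * (1 - coverage)^3 + complexity,
    # with coverage expressed as a fraction in [0, 1]
    return complexity ** 2 * (1 - coverage) ** 3 + complexity

print(crap_score(3, 0.5))  # 3^2 * 0.5^3 + 3 = 9 * 0.125 + 3 = 4.125

The inspected source of the crowdtruth.load module follows:
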
import os
import logging
import sys
import chardet
import re, string
import pandas as pd
import numpy as np
from datetime import datetime
from collections import Counter, OrderedDict

from crowdtruth.models.metrics import *
from crowdtruth.models.worker import *
from crowdtruth.models.unit import *
from crowdtruth.models.job import *
from crowdtruth.configuration import DefaultConfig

# create an ordered counter so that we can maintain the position of tags in the order they were annotated
class OrderedCounter(Counter, OrderedDict):
    pass

def createOrderedCounter(orderedCounter, annotation_vector):
    for relation in annotation_vector:
        if relation not in orderedCounter:
            orderedCounter.update({relation: 0})
    return orderedCounter

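# Illustrative example (values are hypothetical): for a closed task with
# annotation_vector ['true', 'false'] and a single answer 'true',
# createOrderedCounter(OrderedCounter(['true']), ['true', 'false']) returns
# OrderedCounter({'true': 1, 'false': 0}); insertion order is preserved, so
# the annotation vector stays aligned across all judgments.
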
class Found(Exception): pass

def validateTimestampField(date_string, date_format):
    try:
        date_obj = datetime.strptime(date_string, date_format)
        print(date_obj)
    except ValueError:
        raise ValueError('Incorrect date format')

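# Illustrative call (hypothetical values):
# validateTimestampField('2018-05-01 12:30:00', '%Y-%m-%d %H:%M:%S') prints
# the parsed datetime; a string that does not match the format raises
# ValueError('Incorrect date format').
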
def getFileList(directory):
    filelist = []

    # go through all files in this folder
    for f in os.listdir(directory):
        # if it is a folder, scan it recursively and add its csv files,
        # prefixed with the folder name so the returned list stays flat
        if os.path.isdir(directory + '/' + f):
            sublist = getFileList(directory + '/' + f)
            if len(sublist):
                filelist.extend([f + '/' + s for s in sublist])

        # if it is a csv file (and not the ground truth), add it
        elif f.endswith('.csv') and f != 'groundtruth.csv':
            filelist.append(f)
    return filelist

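# Illustrative layout (hypothetical): for a directory containing job1.csv,
# groundtruth.csv and a subfolder batch2/ with job2.csv, getFileList returns
# ['job1.csv', 'batch2/job2.csv'] (in listing order); groundtruth.csv is
# always skipped.
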
def load(**kwargs):

    # placeholder for aggregated results
    results = {
        'jobs' : [],
        'units' : [],
        'workers' : [],
        'judgments' : [],
        'annotations' : []
        }

    if 'config' not in kwargs:
        config = DefaultConfig()
    else:
        logging.info('Config loaded')
        config = kwargs['config']

    # check whether the input is a single file or a directory
    if 'file' in kwargs and kwargs['file'].endswith('.csv'):
        files = [kwargs['file']]
    elif 'directory' in kwargs:
        directory = kwargs['directory']
        files = getFileList(directory)
        logging.info('Found ' + str(len(files)) + ' files')
    else:
        raise ValueError('No input was provided')

    for f in files:
        if 'directory' in locals():
            logging.info("Processing " + f)
            f = directory + "/" + f
        res, config = processFile(f, config)
        for x in res:
            results[x].append(res[x])

    for x in results:
        results[x] = pd.concat(results[x])

    # workers and annotations can appear across jobs, so they need an extra aggregation step
    results['workers'] = results['workers'].groupby(results['workers'].index).agg({
        'unit' : 'sum',
        'judgment' : 'sum',
        'job' : 'count',
        'duration' : 'mean'
        })

    # aggregate annotations
    results['annotations'] = results['annotations'].groupby(results['annotations'].index).sum()

    return results, config

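# Illustrative result shapes (hypothetical data): after
# results, config = load(directory='crowd_data'), results['judgments'] has
# one row per judgment indexed by judgment id, results['workers'] one row
# per worker aggregated across all jobs, and results['annotations'] one row
# per distinct annotation value summed across jobs.
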
def processFile(filename, config):

    job = filename.split('.csv')[0]

    judgments = pd.read_csv(filename)#, encoding=result['encoding'])

    platform = getPlatform(judgments)

    if platform is False:
        logging.info("Custom crowdsourcing platform!")

        if len(config.customPlatformColumns) != 5:
            logging.warning("The following column names are required: judgment id, unit id, worker id, start time, submit time")
            raise ValueError('No custom platform configuration was provided')
        else:
            platform = {
                #'id'                           : 'custom',
                config.customPlatformColumns[0] : 'judgment',
                config.customPlatformColumns[1] : 'unit',
                config.customPlatformColumns[2] : 'worker',
                config.customPlatformColumns[3] : 'started',
                config.customPlatformColumns[4] : 'submitted'
            }

    # establish which fields were part of the input data and which are output judgments;
    # if the config defines which fields to use, take those, else select them automatically
    config = getColumnTypes(judgments, config)

    # remove rows where the worker did not give an answer (AMT issue)
    empty_rows = set()
    for col in config.outputColumns:
        empty_rows = empty_rows.union(judgments[pd.isnull(judgments[col])].index)
    for col in config.outputColumns:
        judgments = judgments[pd.notnull(judgments[col])]
    judgments = judgments.reset_index(drop=True)
    if len(empty_rows) > 0:
        if len(empty_rows) == 1:
            logging.warning(str(len(empty_rows)) + " row with incomplete information in output columns was removed.")
        else:
            logging.warning(str(len(empty_rows)) + " rows with incomplete information in output columns were removed.")

    # allow customization of the judgments
    judgments = config.processJudgments(judgments)

    # update the config after the preprocessing of judgments
    config = getColumnTypes(judgments, config)

    allColumns = dict(list(config.input.items()) + list(config.output.items()) + list(platform.items()))
    judgments = judgments.rename(columns=allColumns)

    # remove columns we don't care about
    judgments = judgments[list(allColumns.values())]

    judgments['job'] = job

    # make output values safe keys
    for col in config.output.values():
        if type(judgments[col].iloc[0]) is dict:
            logging.info("Values stored as dictionary")
            if config.open_ended_task:
                judgments[col] = judgments[col].apply(lambda x: OrderedCounter(x))
            else:
                judgments[col] = judgments[col].apply(lambda x: createOrderedCounter(OrderedCounter(x), config.annotation_vector))
        else:
            logging.info("Values not stored as dictionary")
            if config.open_ended_task:
                judgments[col] = judgments[col].apply(lambda x: OrderedCounter(x.split(config.annotation_separator)))
            else:
                judgments[col] = judgments[col].apply(lambda x: createOrderedCounter(OrderedCounter(x.split(config.annotation_separator)), config.annotation_vector))

    judgments['started'] = judgments['started'].apply(lambda x: pd.to_datetime(str(x)))
    judgments['submitted'] = judgments['submitted'].apply(lambda x: pd.to_datetime(str(x)))
    judgments['duration'] = judgments.apply(lambda row: (row['submitted'] - row['started']).seconds, axis=1)

    # remove units with just 1 judgment
    units_1work = judgments.groupby('unit').filter(lambda x: len(x) == 1)["unit"]
    judgments = judgments[~judgments['unit'].isin(units_1work)]
    judgments = judgments.reset_index(drop=True)
    if len(units_1work) > 0:
        if len(units_1work) == 1:
            logging.warning(str(len(units_1work)) + " Media Unit that was annotated by only 1 Worker was omitted, since agreement cannot be calculated.")
        else:
            logging.warning(str(len(units_1work)) + " Media Units that were annotated by only 1 Worker were omitted, since agreement cannot be calculated.")

    #
    # aggregate units
    #
    units = Unit.aggregate(judgments, config)

    for col in config.output.values():
        judgments[col + '.count'] = judgments[col].apply(lambda x: sum(x.values()))
        judgments[col + '.unique'] = judgments[col].apply(lambda x: len(x))

    #
    # aggregate workers
    #
    workers = Worker.aggregate(judgments, config)

    #
    # aggregate annotations, i.e. output columns
    #
    annotations = pd.DataFrame()
    for col in config.output.values():
        res = pd.DataFrame(judgments[col].apply(lambda x: pd.Series(list(x.keys())).value_counts()).sum(), columns=[col])
        annotations = pd.concat([annotations, res], axis=0)

    #
    # aggregate job
    #
    job = Job.aggregate(units, judgments, workers, config)

    # clean up judgments: remove the input columns
    outputCol = [col for col in judgments.columns.values if col.startswith('output') or col.startswith('metric')]
    judgments = judgments[outputCol + list(platform.values()) + ['duration', 'job']]

    # set judgment id as index
    judgments.set_index('judgment', inplace=True)

    # add missing vector values if closed task
    for col in config.output.values():
        try:
            openended = config.open_ended_task
            for idx in list(units.index):
                for relation in config.annotation_vector:
                    if relation not in units[col][idx]:
                        units[col][idx].update({relation : 0})
        except AttributeError:
            continue

    return {
        'jobs' : job,
        'units' : units,
        'workers' : workers,
        'judgments' : judgments,
        'annotations' : annotations,
        }, config

def getPlatform(df):
    # get the crowdsourcing platform this file originates from

    if df.columns.values[0] == '_unit_id':
        # CrowdFlower
        return {
            #'_platform'    : 'cf',
            '_id'           : 'judgment',
            '_unit_id'      : 'unit',
            '_worker_id'    : 'worker',
            '_started_at'   : 'started',
            '_created_at'   : 'submitted'
        }
    elif df.columns.values[0] == 'HITId':
        # Amazon Mechanical Turk
        return {
            #'id'           : 'amt',
            'AssignmentId'  : 'judgment',
            'HITId'         : 'unit',
            'WorkerId'      : 'worker',
            'AcceptTime'    : 'started',
            'SubmitTime'    : 'submitted'
        }
    else:
        return False

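# Illustrative check (hypothetical columns): a file whose first column is
# '_unit_id' is detected as CrowdFlower, so
# getPlatform(pd.DataFrame(columns=['_unit_id', '_worker_id'])) maps
# '_worker_id' to 'worker'; anything unrecognized returns False, and
# processFile then falls back to config.customPlatformColumns.
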
def getColumnTypes(df, config):
    # fills config.input and config.output with the columns that contain
    # input content and the columns that contain output judgments;
    # each entry matches [original column name]:[safe-string column name]
    config.input = {}
    config.output = {}

    if df.columns.values[0] == 'HITId':
        # Amazon Mechanical Turk
        # if a config is specified, use those columns
        if config.inputColumns:
            config.input = {c:'input.'+c.replace('Input.','') for c in df.columns.values if c in config.inputColumns}
        else:
            config.input = {c:'input.'+c.replace('Input.','') for c in df.columns.values if c.startswith('Input.')}

        # if a config is specified, use those columns
        if config.outputColumns:
            config.output = {c:'output.'+c.replace('Answer.','') for c in df.columns.values if c in config.outputColumns}
        else:
            config.output = {c:'output.'+c.replace('Answer.','') for c in df.columns.values if c.startswith('Answer.')}
        return config

    elif df.columns.values[0] == '_unit_id':
        # CrowdFlower

        # if a config is specified, use those columns
        if config.inputColumns:
            config.input = {c:'input.'+c for c in df.columns.values if c in config.inputColumns}
        if config.outputColumns:
            config.output = {c:'output.'+c for c in df.columns.values if c in config.outputColumns}
        # if there is a config for both input and output columns, we can return those
        if config.inputColumns and config.outputColumns:
            return config

        # try to identify the input and output columns:
        # a column counts as input if all its values are identical within every unit;
        # this is not failsafe but should give decent results without settings
        # (it is best to make a settings.py file for a collection)
        units = df.groupby('_unit_id')
        columns = [c for c in df.columns.values if c != 'clustering' and not c.startswith('_') and not c.startswith('e_') and not c.endswith('_gold') and not c.endswith('_reason') and not c.endswith('browser')]
        for c in columns:
            try:
                for i, unit in units:
                    unique = unit[c].nunique()
                    if unique != 1 and unique != 0:
                        raise Found
                if not config.inputColumns:
                    config.input[c] = 'input.'+c

            except Found:
                if not config.outputColumns:
                    config.output[c] = 'output.'+c

        return config
    else:
        # unknown platform type

        # if a config is specified, use those columns
        if config.inputColumns:
            config.input = {c:'input.'+c for c in df.columns.values if c in config.inputColumns}
        if config.outputColumns:
            config.output = {c:'output.'+c for c in df.columns.values if c in config.outputColumns}
        # return the config with whatever columns were specified
        return config
361