crowdtruth.crowd_platform.configure_amt_columns() - Code Metrics - CrowdTruth/CrowdTruth-core - Measure and Improve Code Quality continuously with Scrutinizer

crowdtruth.crowd_platform.configure_amt_columns() A
last analyzed 2019-05-10 09:08 UTC

↳ Parent: crowdtruth.crowd_platform

Complexity

Conditions

Size

Total Lines	20
Code Lines	14

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	8
CRAP Score	3.072

Importance

Changes

Metric	Value
eloc	14
dl	0
loc	20
ccs	8
cts	10
cp	0.8
rs	9.7
c	0
b	0
f	0
cc	3
nop	2
crap	3.072

"""
Module used to process information regarding the crowdsourcing platform.
"""
class Found(Exception):
    """ Control-flow signal: raised to leave a nested search as soon as a match is seen. """

def get_platform(dframe):
    """ Get the crowdsourcing platform this file originates from.

    The platform is recognised by the name of the first column of dframe.
    Returns a dict mapping the platform's own column names to the generic
    names ('judgment', 'unit', 'worker', 'started', 'submitted'), or False
    when the first column matches no known platform.
    """
    first_column = dframe.columns.values[0]

    # (marker column, column-name mapping) for every supported platform
    layouts = (
        # CrowdFlower
        ('_unit_id', {
            '_id': 'judgment',
            '_unit_id': 'unit',
            '_worker_id': 'worker',
            '_started_at': 'started',
            '_created_at': 'submitted',
        }),
        # Mturk
        ('HITId', {
            'AssignmentId': 'judgment',
            'HITId': 'unit',
            'WorkerId': 'worker',
            'AcceptTime': 'started',
            'SubmitTime': 'submitted',
        }),
    )

    for marker, mapping in layouts:
        if first_column == marker:
            return mapping
    return False

def _strip_leading_prefix(name, prefix):
    """ Return name without prefix if it starts with it, otherwise name unchanged. """
    return name[len(prefix):] if name.startswith(prefix) else name

def configure_amt_columns(dframe, config):
    """ Configures AMT input and output columns.

    Input columns are the ones listed in config.inputColumns or, when that is
    empty, every column whose name starts with 'Input.'. Output columns are
    chosen the same way from config.outputColumns / the 'Answer.' prefix.

    Each selected column is mapped to 'input.<name>' or 'output.<name>', where
    <name> is the original column name with only its leading 'Input.' /
    'Answer.' prefix removed; later occurrences of those markers inside the
    name are kept.

    Stores the two dicts on config.input and config.output and returns them
    as the tuple (config.input, config.output).
    """
    def build_mapping(requested, amt_prefix, target_prefix):
        # explicit configuration wins; otherwise fall back to the AMT prefix
        if requested:
            chosen = [c for c in dframe.columns.values if c in requested]
        else:
            chosen = [c for c in dframe.columns.values if c.startswith(amt_prefix)]
        return {c: target_prefix + _strip_leading_prefix(c, amt_prefix) for c in chosen}

    config.input = build_mapping(config.inputColumns, 'Input.', 'input.')
    config.output = build_mapping(config.outputColumns, 'Answer.', 'output.')
    return config.input, config.output

def configure_platform_columns(dframe, config):
    """ Configures FigureEight and custom platforms input and output columns.

    Only columns explicitly listed in config.inputColumns / config.outputColumns
    are selected; each is mapped to 'input.<name>' or 'output.<name>'. A side
    whose configuration is empty ends up with an empty dict.

    Stores the dicts on config.input and config.output and returns them as the
    tuple (config.input, config.output).
    """
    config.input = {}
    config.output = {}

    if config.inputColumns:
        selected_inputs = {}
        for name in dframe.columns.values:
            if name in config.inputColumns:
                selected_inputs[name] = 'input.' + name
        config.input = selected_inputs

    if config.outputColumns:
        selected_outputs = {}
        for name in dframe.columns.values:
            if name in config.outputColumns:
                selected_outputs[name] = 'output.' + name
        config.output = selected_outputs

    return config.input, config.output

def configure_with_missing_columns(dframe, config):
    """ Identifies input and output columns from how their values vary within a unit.

    Rows are grouped by '_unit_id'. A candidate column for which nunique() is
    0 or 1 in every unit is treated as an input column; a column that has more
    than one distinct value inside at least one unit is treated as an output
    column.

    Columns are skipped when they are named 'clustering', start with '_' or
    'e_', or end with '_gold', '_reason' or 'browser'.

    Entries are added to config.input only when config.inputColumns is empty,
    and to config.output only when config.outputColumns is empty; existing
    entries of config.input / config.output are left in place.

    Returns the same config object, modified in place.
    """
    units = dframe.groupby('_unit_id')
    candidates = [c for c in dframe.columns.values
                  if c != 'clustering'
                  and not c.startswith(('_', 'e_'))
                  and not c.endswith(('_gold', '_reason', 'browser'))]

    for colname in candidates:
        # all() stops at the first unit in which the column takes several values
        constant_within_units = all(unit[colname].nunique() <= 1 for _, unit in units)
        if constant_within_units:
            if not config.inputColumns:
                config.input[colname] = 'input.'+colname
        elif not config.outputColumns:
            config.output[colname] = 'output.'+colname

    return config

def get_column_types(dframe, config):
    """ Determine the input and output columns and store them on config.

    config.input and config.output become dicts in which each entry matches
    [original column name]:[safestring column name]. The platform is recognised
    by the name of the first column of dframe:

    * 'HITId' (Mturk): columns come from configure_amt_columns.
    * '_unit_id': columns listed in the config are used; if the config does not
      name both input and output columns, the missing side is inferred by
      configure_with_missing_columns.
    * anything else (unknown platform): only columns listed in the config are
      used; nothing is inferred.

    Always returns config, including for an unknown platform whose config does
    not name both input and output columns.
    """
    config.input = {}
    config.output = {}

    first_column = dframe.columns.values[0]

    if first_column == 'HITId':
        # Mturk
        # if config is specified, use those columns
        config.input, config.output = configure_amt_columns(dframe, config)
        return config

    # '_unit_id' and unknown platforms: if a config is specified, use those columns
    config.input, config.output = configure_platform_columns(dframe, config)
    # if there is a config for both input and output columns, we can return those
    if config.inputColumns and config.outputColumns:
        return config

    if first_column == '_unit_id':
        # try to identify the input and output columns
        # this is the case if all the values in the column are identical
        # this is not failsafe but should give decent results without settings
        # it is best to make a settings.py file for a collection
        return configure_with_missing_columns(dframe, config)

    # unknown platform type: the inference above groups rows by '_unit_id',
    # which is not the leading column here, so hand back whatever the config
    # provided instead of implicitly returning None
    return config


1		"""
2		Module used to process information regarding the crowdsourcing platform.
3		"""
4	1	class Found(Exception):
5		""" Exception. """
6	1	pass
7
8	1	def get_platform(dframe):
9		""" Get the crowdsourcing platform this file originates to """
10
11	1	if dframe.columns.values[0] == '_unit_id':
12		# CrowdFlower
13	1	return {
14		#'_platform' : 'cf',
15		'_id' : 'judgment',
16		'_unit_id' : 'unit',
17		'_worker_id' : 'worker',
18		'_started_at' : 'started',
19		'_created_at' : 'submitted'
20		}
21	1	elif dframe.columns.values[0] == 'HITId':
22		# Mturk
23	1	return {
24		#'id' : 'amt',
25		'AssignmentId' : 'judgment',
26		'HITId' : 'unit',
27		'WorkerId' : 'worker',
28		'AcceptTime' : 'started',
29		'SubmitTime' : 'submitted'
30		}
31	1	return False
32
33	1	def configure_amt_columns(dframe, config):
34		""" Configures AMT input and output columns. """
35	1	config.input = {}
36	1	config.output = {}
37
38	1	if config.inputColumns:
39	1	config.input = {c: 'input.'+c.replace('Input.', '') \
40		for c in dframe.columns.values if c in config.inputColumns}
41		else:
42		config.input = {c: 'input.'+c.replace('Input.', '') \
43		for c in dframe.columns.values if c.startswith('Input.')}
44
45		# if config is specified, use those columns
46	1	if config.outputColumns:
47	1	config.output = {c: 'output.'+c.replace('Answer.', '') \
48		for c in dframe.columns.values if c in config.outputColumns}
49		else:
50		config.output = {c: 'output.'+c.replace('Answer.', '') \
51		for c in dframe.columns.values if c.startswith('Answer.')}
52	1	return config.input, config.output
53
54	1	def configure_platform_columns(dframe, config):
55		""" Configures FigureEight and custom platforms input and output columns. """
56	1	config.input = {}
57	1	config.output = {}
58
59	1	if config.inputColumns:
60	1	config.input = {c: 'input.'+c for c in dframe.columns.values \
61		if c in config.inputColumns}
62	1	if config.outputColumns:
63	1	config.output = {c: 'output.'+c for c in dframe.columns.values \
64		if c in config.outputColumns}
65	1	return config.input, config.output
66
67	1	def configure_with_missing_columns(dframe, config):
68		""" Identifies the type of the column based on naming """
69		units = dframe.groupby('_unit_id')
70		columns = [c for c in dframe.columns.values if c != 'clustering' and not c.startswith('_') \
71		and not c.startswith('e_') and not c.endswith('_gold') \
72		and not c.endswith('_reason') and not c.endswith('browser')]
73		for colname in columns:
74		try:
75		for _, unit in units:
76		unique = unit[colname].nunique()
77		if unique != 1 and unique != 0:
78		raise Found
79		if not config.inputColumns:
80		config.input[colname] = 'input.'+colname
81
82		except Found:
83		if not config.outputColumns:
84		config.output[colname] = 'output.'+colname
85
86		return config
87
88	1	def get_column_types(dframe, config):
89		""" return input and output columns """
90		# returns a list of columns that contain are input content
91	1	config.input = {}
92	1	config.output = {}
93
94		# get a dict of the columns with input content and the columns with output judgments
95		# each entry matches [original column name]:[safestring column name]
96	1	if dframe.columns.values[0] == 'HITId':
97		# Mturk
98		# if config is specified, use those columns
99	1	config.input, config.output = configure_amt_columns(dframe, config)
100
101	1	return config
102
103	1	elif dframe.columns.values[0] == '_unit_id':
104
105		# if a config is specified, use those columns
106	1	config.input, config.output = configure_platform_columns(dframe, config)
107		# if there is a config for both input and output columns, we can return those
108	1	if config.inputColumns and config.outputColumns:
109	1	return config
110
111		# try to identify the input and output columns
112		# this is the case if all the values in the column are identical
113		# this is not failsafe but should give decent results without settings
114		# it is best to make a settings.py file for a collection
115
116		return configure_with_missing_columns(dframe, config)
117
118		else:
119		# unknown platform type
120
121		# if a config is specified, use those columns
122	1	config.input, config.output = configure_platform_columns(dframe, config)
123		# if there is a config for both input and output columns, we can return those
124	1	if config.inputColumns and config.outputColumns:
125		return config
126

CrowdTruth / CrowdTruth-core

GitHub Access Token became invalid

crowdtruth.crowd_platform.configure_amt_columns() A last analyzed 2019-05-10 09:08 UTC

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like

crowdtruth.crowd_platform.configure_amt_columns() A
last analyzed 2019-05-10 09:08 UTC