| Conditions | 31 |
| Total Lines | 148 |
| Code Lines | 82 |
| Lines | 0 |
| Ratio | 0 % |
| Tests | 54 |
| CRAP Score | 44.1854 |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
- Extract Method

If many parameters/temporary variables are present:
- Replace Temp with Query
- Introduce Parameter Object
- Preserve Whole Object
Complex classes like crowdtruth.load.processFile() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | 1 | import os |
|
def processFile(filename, config):
    """Load one crowdsourcing results CSV and aggregate it.

    The file is read with pandas, its platform-specific columns are mapped to
    the standard names ``judgment``, ``unit``, ``worker``, ``started`` and
    ``submitted``, rows with missing output values and units with a single
    judgment are dropped, and the remaining judgments are aggregated into
    units, workers, annotations and a job summary.

    Args:
        filename: Path of the CSV file to load. The part of the path before
            the first ``'.csv'`` is used as the job identifier.
        config: Configuration object. This function reads
            ``customPlatformColumns``, ``outputColumns``, ``input``,
            ``output``, ``open_ended_task``, ``annotation_vector`` and
            ``annotation_separator`` and calls ``processJudgments``. It is
            passed through ``getColumnTypes`` twice, and the result of the
            second call is returned.

    Returns:
        A tuple ``(results, config)`` where ``results`` is a dict with the
        keys ``'jobs'``, ``'units'``, ``'workers'``, ``'judgments'`` and
        ``'annotations'``.

    Raises:
        ValueError: If the platform is not recognized by ``getPlatform`` and
            ``config.customPlatformColumns`` does not hold exactly 5 names.
    """
    job = filename.split('.csv')[0]

    judgments = pd.read_csv(filename)

    platform = getPlatform(judgments)

    # getPlatform signals an unrecognized platform by returning False; the
    # comparison is kept as-is so that contract is not altered here.
    if platform == False:  # noqa: E712
        logging.info("Custom crowdsourcing platform!")

        if len(config.customPlatformColumns) != 5:
            logging.warning("The following column names are required: judgment id, unit id, worker id, start time, submit time")
            raise ValueError('No custom platform configuration was provided')

        # Map the user-supplied column names (in this fixed order) to the
        # standard names used in the rest of this function.
        platform = {
            config.customPlatformColumns[0]: 'judgment',
            config.customPlatformColumns[1]: 'unit',
            config.customPlatformColumns[2]: 'worker',
            config.customPlatformColumns[3]: 'started',
            config.customPlatformColumns[4]: 'submitted',
        }

    # Establish which fields are input data and which are output judgments.
    config = getColumnTypes(judgments, config)

    # Remove rows where the worker did not give an answer (AMT issue).
    # The affected indices are collected first, against the unfiltered frame,
    # so the reported count covers every output column.
    empty_rows = set()
    for col in config.outputColumns:
        empty_rows = empty_rows.union(judgments[judgments[col].isnull()].index)
    for col in config.outputColumns:
        judgments = judgments[judgments[col].notnull()]
    judgments = judgments.reset_index(drop=True)
    if len(empty_rows) > 0:
        if len(empty_rows) == 1:
            logging.warning(str(len(empty_rows)) + " row with incomplete information in output columns was removed.")
        else:
            logging.warning(str(len(empty_rows)) + " rows with incomplete information in output columns were removed.")

    # Allow customization of the judgments.
    judgments = config.processJudgments(judgments)

    # Update the config after the preprocessing of judgments.
    config = getColumnTypes(judgments, config)

    # Later mappings win on duplicate keys: input, then output, then platform.
    allColumns = dict(config.input)
    allColumns.update(config.output)
    allColumns.update(platform)
    judgments = judgments.rename(columns=allColumns)

    # Remove columns we don't care about.
    judgments = judgments[list(allColumns.values())]

    judgments['job'] = job

    # Make output values safe keys: every output cell becomes a counter.
    for col in config.output.values():
        # Only the first row is inspected to decide how the column is stored.
        if isinstance(judgments[col].iloc[0], dict):
            logging.info("Values stored as dictionary")
            if config.open_ended_task:
                judgments[col] = judgments[col].apply(lambda x: OrderedCounter(x))
            else:
                # Fixed: this branch previously referenced the undefined name
                # `judgements`, raising NameError for closed tasks with
                # dict-valued output columns.
                judgments[col] = judgments[col].apply(lambda x: createOrderedCounter(OrderedCounter(x), config.annotation_vector))
        else:
            logging.info("Values not stored as dictionary")
            if config.open_ended_task:
                judgments[col] = judgments[col].apply(lambda x: OrderedCounter(x.split(config.annotation_separator)))
            else:
                judgments[col] = judgments[col].apply(lambda x: createOrderedCounter(OrderedCounter(x.split(config.annotation_separator)), config.annotation_vector))

    judgments['started'] = judgments['started'].apply(lambda x: pd.to_datetime(str(x)))
    judgments['submitted'] = judgments['submitted'].apply(lambda x: pd.to_datetime(str(x)))
    # NOTE(review): `.seconds` is only the seconds component of the timedelta
    # (days are dropped); confirm whether `.total_seconds()` is intended.
    judgments['duration'] = judgments.apply(lambda row: (row['submitted'] - row['started']).seconds, axis=1)

    # Remove units with just 1 judgment.
    units_1work = judgments.groupby('unit').filter(lambda x: len(x) == 1)["unit"]
    judgments = judgments[~judgments['unit'].isin(units_1work)]
    judgments = judgments.reset_index(drop=True)
    if len(units_1work) > 0:
        if len(units_1work) == 1:
            logging.warning(str(len(units_1work)) + " Media Unit that was annotated by only 1 Worker was omitted, since agreement cannot be calculated.")
        else:
            logging.warning(str(len(units_1work)) + " Media Units that were annotated by only 1 Worker were omitted, since agreement cannot be calculated.")

    # Aggregate units.
    units = Unit.aggregate(judgments, config)

    # These columns are added after the unit aggregation and before the
    # worker aggregation; the order is kept as in the original.
    for col in config.output.values():
        judgments[col + '.count'] = judgments[col].apply(lambda x: sum(x.values()))
        judgments[col + '.unique'] = judgments[col].apply(lambda x: len(x))

    # Aggregate workers.
    workers = Worker.aggregate(judgments, config)

    # Aggregate annotations, i.e. output columns: for each annotation key,
    # count the number of judgments in which it appears.
    annotations = pd.DataFrame()
    for col in config.output.values():
        res = pd.DataFrame(judgments[col].apply(lambda x: pd.Series(list(x.keys())).value_counts()).sum(), columns=[col])
        annotations = pd.concat([annotations, res], axis=0)

    # Aggregate job. From here on `job` holds the aggregate, not the file stem.
    job = Job.aggregate(units, judgments, workers, config)

    # Clean up judgments: keep only output/metric columns plus the standard
    # platform columns, dropping the input columns.
    outputCol = [col for col in judgments.columns.values if col.startswith('output') or col.startswith('metric')]
    judgments = judgments[outputCol + list(platform.values()) + ['duration', 'job']]

    # Set judgment id as index (non-inplace, since `judgments` is a slice).
    judgments = judgments.set_index('judgment')

    # Add missing vector values to every unit.
    for col in config.output.values():
        try:
            # Attribute access kept from the original: a config without
            # `open_ended_task` or `annotation_vector` skips this step.
            _ = config.open_ended_task
            for idx in list(units.index):
                for relation in config.annotation_vector:
                    if relation not in units[col][idx]:
                        units[col][idx].update({relation: 0})
        except AttributeError:
            continue

    return {
        'jobs': job,
        'units': units,
        'workers': workers,
        'judgments': judgments,
        'annotations': annotations,
    }, config
| 264 | |||
| 361 |