| Metric | Value |
| --- | --- |
| Conditions | 16 |
| Total Lines | 118 |
| Code Lines | 60 |
| Lines | 0 |
| Ratio | 0 % |
| Tests | 37 |
| CRAP Score | 17.4389 |
| Changes | 0 |
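For context, assuming the standard C.R.A.P. formula, the score combines a method's cyclomatic complexity comp(m) with its test coverage cov(m), expressed as a fraction:

    CRAP(m) = comp(m)^2 * (1 - cov(m))^3 + comp(m)

Taking the 16 conditions above as comp(m) and roughly 82 % line coverage (consistent with the uncovered custom-platform branch in the listing below), 16^2 * (1 - 0.822)^3 + 16 ≈ 17.44, matching the reported score. Note that the quadratic complexity term dominates: even at 100 % coverage the score would still be 16, so splitting the method up lowers it far more than adding tests alone.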
Small methods make your code easier to understand, in particular when combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a good sign to extract the commented part into a new method, and to use the comment as a starting point when coming up with a good name for the new method.

Commonly applied refactorings include:

- Extract Method (sketched after the listing below)
- If many parameters or temporary variables are present: Replace Method with Method Object

Complex methods like crowdtruth.load.process_file() often do a lot of different things. To break such a method down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for statements whose variables or helper calls share the same prefixes or suffixes.

Once you have determined the statements that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster. A sketch of this, too, follows the listing below.

The source of process_file() as reported:
```python
#pylint: disable=W0223
import logging

import pandas as pd

# get_platform, get_column_types, remove_empty_rows, make_output_cols_safe_keys,
# remove_single_judgment_units and the Unit, Worker and Job aggregators are
# defined elsewhere in the crowdtruth package.

def process_file(filename, config):
    """Process an input file with the given configuration."""

    judgments = pd.read_csv(filename)

    platform = get_platform(judgments)

    if platform is False:
        logging.info("Custom crowdsourcing platform!")
        no_of_columns = len(config.customPlatformColumns)
        if no_of_columns != 5:
            logging.warning("The following column names are required: judgment id, "
                            "unit id, worker id, start time, submit time")
            raise ValueError('No custom platform configuration was provided')
        platform = {
            config.customPlatformColumns[0]: 'judgment',
            config.customPlatformColumns[1]: 'unit',
            config.customPlatformColumns[2]: 'worker',
            config.customPlatformColumns[3]: 'started',
            config.customPlatformColumns[4]: 'submitted',
        }

    # Establish which fields were part of the input data and which are output
    # judgments: honor the config if it defines them, else select automatically.
    config = get_column_types(judgments, config)

    judgments = remove_empty_rows(config, judgments)
    # Allow customization of the judgments.
    judgments = config.processJudgments(judgments)

    # Update the config after the preprocessing of judgments.
    config = get_column_types(judgments, config)

    all_columns = dict(list(config.input.items()) + list(config.output.items())
                       + list(platform.items()))
    judgments = judgments.rename(columns=all_columns)

    # Remove columns we don't care about.
    judgments = judgments[list(all_columns.values())]

    judgments['job'] = filename.split('.csv')[0]

    # Make output values safe keys.
    judgments = make_output_cols_safe_keys(config, judgments)

    judgments['started'] = judgments['started'].apply(lambda x: pd.to_datetime(str(x)))
    judgments['submitted'] = judgments['submitted'].apply(lambda x: pd.to_datetime(str(x)))
    judgments['duration'] = judgments.apply(
        lambda row: (row['submitted'] - row['started']).seconds, axis=1)

    # Remove units with just one judgment.
    judgments = remove_single_judgment_units(judgments)

    # Aggregate units.
    units = Unit.aggregate(judgments, config)

    for col in config.output.values():
        judgments[col + '.count'] = judgments[col].apply(lambda x: sum(x.values()))
        judgments[col + '.unique'] = judgments[col].apply(len)

    # Aggregate workers.
    workers = Worker.aggregate(judgments, config)

    # Aggregate annotations, i.e. the output columns.
    annotations = pd.DataFrame()
    for col in config.output.values():
        res = pd.DataFrame(judgments[col].apply(
            lambda x: pd.Series(list(x.keys())).value_counts()).sum(), columns=[col])
        annotations = pd.concat([annotations, res], axis=0)

    # Aggregate the job.
    job = Job.aggregate(units, judgments, config)

    # Clean up judgments: drop the input columns.
    output_cols = [col for col in judgments.columns.values
                   if col.startswith('output') or col.startswith('metric')]
    judgments = judgments[output_cols + list(platform.values()) + ['duration', 'job']]

    # Set the judgment id as index.
    judgments.set_index('judgment', inplace=True)

    # Add missing vector values if this is a closed task.
    for col in config.output.values():
        try:
            for idx in list(units.index):
                for relation in config.annotation_vector:
                    if relation not in units[col][idx]:
                        units[col][idx].update({relation: 0})
        except AttributeError:
            continue

    return {
        'jobs': job,
        'units': units,
        'workers': workers,
        'judgments': judgments,
        'annotations': annotations,
    }, config
```
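As a minimal sketch of Extract Method applied here: the timestamp parsing and duration computation form one commented, self-contained step, so they can move into a helper whose name documents the intent. The name parse_judgment_times is hypothetical, not part of crowdtruth:

```python
import pandas as pd

def parse_judgment_times(judgments):
    """Parse start/submit timestamps and derive each judgment's duration in seconds."""
    judgments['started'] = judgments['started'].apply(lambda x: pd.to_datetime(str(x)))
    judgments['submitted'] = judgments['submitted'].apply(lambda x: pd.to_datetime(str(x)))
    judgments['duration'] = judgments.apply(
        lambda row: (row['submitted'] - row['started']).seconds, axis=1)
    return judgments
```

The call site in process_file() then shrinks to `judgments = parse_judgment_times(judgments)`, and the explanatory comment becomes redundant.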
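The aggregation steps also illustrate the shared-prefix heuristic: four consecutive blocks each "aggregate" something. A sketch of Replace Method with Method Object / Extract Class that groups them; ResultAggregator is an invented name, and Unit, Worker, Job and pd refer to the same module-level names as in the listing:

```python
class ResultAggregator:
    """Groups the unit/worker/annotation/job aggregation steps of process_file()."""

    def __init__(self, judgments, config):
        self.judgments = judgments  # reference, not a copy, so later column
        self.config = config        # additions to judgments are still visible

    def units(self):
        return Unit.aggregate(self.judgments, self.config)

    def workers(self):
        return Worker.aggregate(self.judgments, self.config)

    def annotations(self):
        # Count, per output column, how often each annotation key occurs.
        annotations = pd.DataFrame()
        for col in self.config.output.values():
            res = pd.DataFrame(self.judgments[col].apply(
                lambda x: pd.Series(list(x.keys())).value_counts()).sum(), columns=[col])
            annotations = pd.concat([annotations, res], axis=0)
        return annotations

    def job(self, units):
        return Job.aggregate(units, self.judgments, self.config)
```

Because the object holds a reference to judgments rather than a copy, process_file() can still add the `.count`/`.unique` columns between the unit and worker aggregations without breaking the ordering in the original code.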
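Lastly, a hedged sketch of calling the function. MyConfig below is a hypothetical stand-in, not a real crowdtruth configuration class; the attributes shown are only the ones the listing actually touches (customPlatformColumns for unrecognized platforms, the processJudgments hook, and annotation_vector for closed tasks):

```python
class MyConfig:
    # Hypothetical column mapping for a custom platform, in the required order:
    # judgment id, unit id, worker id, start time, submit time.
    customPlatformColumns = ['_id', '_unit_id', '_worker_id', '_started_at', '_created_at']
    # Closed-task annotation vector; omit for open-ended tasks.
    annotation_vector = ['yes', 'no']

    def processJudgments(self, judgments):
        return judgments  # no custom preprocessing in this sketch

# process_file() returns a dict of aggregates plus the (possibly updated) config.
results, config = process_file('my_job.csv', MyConfig())
print(results['units'].head())
```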