| Metric | Value |
| --- | --- |
| Total Complexity | 69 |
| Total Lines | 306 |
| Duplicated Lines | 0 % |
Complex classes like pyspider.libs.BaseHandler often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common way to find such a component is to look for fields and methods that share the same prefixes or suffixes.
Once you have determined which fields belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster; see the sketch below.
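As a concrete illustration against the class below: `_crawl` repacks keyword arguments into `schedule`, `fetch`, and `process` sub-dicts with three near-identical loops, and each group is a candidate for Extract Class. A minimal sketch, assuming a hypothetical `TaskSchedule` helper (not part of pyspider):

```python
# Hypothetical sketch of Extract Class applied to _crawl()'s
# schedule handling: the key list and the pop-loop move into one place.
class TaskSchedule(object):
    KEYS = ('priority', 'retries', 'exetime', 'age', 'itag',
            'force_update', 'auto_recrawl')

    @classmethod
    def from_kwargs(cls, kwargs):
        # Pop the schedule keys out of the crawl() kwargs,
        # mirroring the inline loop in _crawl() below.
        return {key: kwargs.pop(key) for key in cls.KEYS if key in kwargs}
```

`_crawl` would then read `task['schedule'] = TaskSchedule.from_kwargs(kwargs)`, and the same move applies to the `fetch` and `process` groups.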
```python
#!/usr/bin/env python

# ... (module imports and the BaseHandlerMeta definition, elided in this report) ...

@add_metaclass(BaseHandlerMeta)
class BaseHandler(object):
    """
    BaseHandler for all scripts.

    `BaseHandler.run` is the main method to handle the task.
    """
    crawl_config = {}
    project_name = None
    _cron_jobs = []
    _min_tick = 0
    __env__ = {'not_inited': True}
    retry_delay = {}

    def _reset(self):
        """
        Reset per-task state before each task.
        """
        self._extinfo = {}
        self._messages = []
        self._follows = []
        self._follows_keys = set()

    def _run_func(self, function, *arguments):
        """
        Run the callback function with the number of arguments it accepts.
        """
        args, varargs, keywords, defaults = inspect.getargspec(function)
        return function(*arguments[:len(args) - 1])
```
```python
    def _run_task(self, task, response):
        """
        Find the callback specified by `task['process']['callback']`
        and raise a status error for it if needed.
        """
        process = task.get('process', {})
        callback = process.get('callback', '__call__')
        if not hasattr(self, callback):
            raise NotImplementedError("self.%s() not implemented!" % callback)

        function = getattr(self, callback)
        # do not run_func when 304
        if response.status_code == 304 and not getattr(function, '_catch_status_code_error', False):
            return None
        if not getattr(function, '_catch_status_code_error', False):
            response.raise_for_status()
        return self._run_func(function, response, task)

    def run_task(self, module, task, response):
        """
        Process the task, catching exceptions and logs, and return a `ProcessorResult` object.
        """
        logger = module.logger
        result = None
        exception = None
        stdout = sys.stdout
        self.task = task
        if isinstance(response, dict):
            response = rebuild_response(response)
        self.response = response
        self.save = (task.get('track') or {}).get('save', {})

        try:
            if self.__env__.get('enable_stdout_capture', True):
                sys.stdout = ListO(module.log_buffer)
            self._reset()
            result = self._run_task(task, response)
            if inspect.isgenerator(result):
                for r in result:
                    self._run_func(self.on_result, r, response, task)
            else:
                self._run_func(self.on_result, result, response, task)
        except Exception as e:
            logger.exception(e)
            exception = e
        finally:
            follows = self._follows
            messages = self._messages
            logs = list(module.log_buffer)
            extinfo = self._extinfo
            save = self.save

            sys.stdout = stdout
            self.task = None
            self.response = None
            self.save = None

        module.log_buffer[:] = []
        return ProcessorResult(result, follows, messages, logs, exception, extinfo, save)
```
```python
    def _crawl(self, url, **kwargs):
        """
        The real crawl API.

        Check kwargs and repack them into each sub-dict.
        """
        task = {}

        assert len(url) < 1024, "Maximum (1024) URL length error."

        if kwargs.get('callback'):
            callback = kwargs['callback']
            if isinstance(callback, six.string_types) and hasattr(self, callback):
                func = getattr(self, callback)
            elif six.callable(callback) and six.get_method_self(callback) is self:
                func = callback
                kwargs['callback'] = func.__name__
            else:
                raise NotImplementedError("self.%s() not implemented!" % callback)
            if hasattr(func, '_config'):
                for k, v in iteritems(func._config):
                    if isinstance(v, dict) and isinstance(kwargs.get(k), dict):
                        kwargs[k].update(v)
                    else:
                        kwargs.setdefault(k, v)

        for k, v in iteritems(self.crawl_config):
            if isinstance(v, dict) and isinstance(kwargs.get(k), dict):
                kwargs[k].update(v)
            else:
                kwargs.setdefault(k, v)

        url = quote_chinese(_build_url(url.strip(), kwargs.pop('params', None)))
        if kwargs.get('files'):
            assert isinstance(
                kwargs.get('data', {}), dict), "data must be a dict when using with files!"
            content_type, data = _encode_multipart_formdata(kwargs.pop('data', {}),
                                                            kwargs.pop('files', {}))
            kwargs.setdefault('headers', {})
            kwargs['headers']['Content-Type'] = content_type
            kwargs['data'] = data
        if kwargs.get('data'):
            kwargs['data'] = _encode_params(kwargs['data'])
        if kwargs.get('data'):
            kwargs.setdefault('method', 'POST')

        schedule = {}
        for key in ('priority', 'retries', 'exetime', 'age', 'itag', 'force_update',
                    'auto_recrawl'):
            if key in kwargs:
                schedule[key] = kwargs.pop(key)
        task['schedule'] = schedule

        fetch = {}
        for key in (
                'method',
                'headers',
                'data',
                'timeout',
                'allow_redirects',
                'cookies',
                'proxy',
                'etag',
                'last_modifed',
                'save',
                'js_run_at',
                'js_script',
                'js_viewport_width',
                'js_viewport_height',
                'load_images',
                'fetch_type',
                'use_gzip',
                'validate_cert',
                'max_redirects',
                'robots_txt'
        ):
            if key in kwargs:
                fetch[key] = kwargs.pop(key)
        task['fetch'] = fetch

        process = {}
        for key in ('callback', ):
            if key in kwargs:
                process[key] = kwargs.pop(key)
        task['process'] = process

        task['project'] = self.project_name
        task['url'] = url
        if 'taskid' in kwargs:
            task['taskid'] = kwargs.pop('taskid')
        else:
            task['taskid'] = self.get_taskid(task)

        if kwargs:
            raise TypeError('crawl() got unexpected keyword argument: %s' % kwargs.keys())

        cache_key = "%(project)s:%(taskid)s" % task
        if cache_key not in self._follows_keys:
            self._follows_keys.add(cache_key)
            self._follows.append(task)
        return task
```
```python
    def get_taskid(self, task):
        '''Generate the taskid from task information, md5(url) by default; override me.'''
        return md5string(task['url'])
```
```python
    # apis
    def crawl(self, url, **kwargs):
        '''
        available params:
          url
          callback

          method
          params
          data
          files
          headers
          timeout
          allow_redirects
          cookies
          proxy
          etag
          last_modifed
          auto_recrawl

          fetch_type
          js_run_at
          js_script
          js_viewport_width
          js_viewport_height
          load_images

          priority
          retries
          exetime
          age
          itag

          save
          taskid

        full documentation: http://pyspider.readthedocs.org/en/latest/apis/self.crawl/
        '''

        if isinstance(url, six.string_types) and url.startswith('curl '):
            curl_kwargs = curl_to_arguments(url)
            url = curl_kwargs.pop('urls')
            for k, v in iteritems(curl_kwargs):
                kwargs.setdefault(k, v)

        if isinstance(url, six.string_types):
            return self._crawl(url, **kwargs)
        elif hasattr(url, "__iter__"):
            result = []
            for each in url:
                result.append(self._crawl(each, **kwargs))
            return result
```
```python
    def is_debugger(self):
        """Return True if running in the debugger."""
        return self.__env__.get('debugger')

    def send_message(self, project, msg, url='data:,on_message'):
        """Send a message to another project."""
        self._messages.append((project, msg, url))

    def on_message(self, project, msg):
        """Receive a message from another project; override me."""
        pass
```
```python
    def on_result(self, result):
        """Receive returns from other callbacks; override me."""
        if not result:
            return
        assert self.task, "on_result can't be called outside a callback."
        if self.is_debugger():
            pprint(result)
        if self.__env__.get('result_queue'):
            self.__env__['result_queue'].put((self.task, result))
```
```python
    @not_send_status
    def _on_message(self, response):
        project, msg = response.save
        return self.on_message(project, msg)

    @not_send_status
    def _on_cronjob(self, response, task):
        if (not response.save
                or not isinstance(response.save, dict)
                or 'tick' not in response.save):
            return

        # When triggered, an '_on_cronjob' task is sent from the scheduler with
        # 'tick' in response.save. The scheduler sends the trigger task at least
        # every GCD of the intervals of the cron jobs. This method checks the
        # tick against each cron job's interval to decide whether to run it.
        for cronjob in self._cron_jobs:
            if response.save['tick'] % cronjob.tick != 0:
                continue
            function = cronjob.__get__(self, self.__class__)
            self._run_func(function, response, task)
```
```python
    def _on_get_info(self, response, task):
        """Send runtime information about this script."""
        for each in response.save or []:
            if each == 'min_tick':
                self.save[each] = self._min_tick
            elif each == 'retry_delay':
                if not isinstance(self.retry_delay, dict):
                    self.retry_delay = {'': self.retry_delay}
                self.save[each] = self.retry_delay

    @not_send_status
    def on_finished(self, response, task):
        pass
```
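`on_finished` is the hook fired when the project's tasks are all done; the default above is a no-op. A hypothetical override that kicks off end-of-crawl work:

```python
class Handler(BaseHandler):
    def on_finished(self, response, task):
        # Fired once every task in the project has been processed;
        # e.g. schedule a final summary task.
        self.crawl('data:,summary', callback=self.make_summary)
```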