Complex classes like Crawl often do a lot of different things. To break such a class down, we need to identify cohesive components within it. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined which fields and methods belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often the faster option.
While breaking up the class, it is a good idea to analyze how other classes use Crawl and, based on these observations, apply Extract Interface as well.
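For example, the `setUrlCrawlPatterns`, `setUrlCrawlRegEx`, `setUrlProcessPatterns`, `setUrlProcessRegEx`, and `setPageProcessPatterns` setters in the skeleton below share a naming pattern and all configure URL/page filtering, which makes them a natural Extract Class candidate. A minimal sketch of what the extracted component could look like (the class name `CrawlUrlFilters` and its `toArray()` contract are illustrative assumptions, not part of the library):

```php
<?php

// Hypothetical extraction target: groups the URL/page filtering setters that
// currently sit directly on Crawl. Names and structure are assumptions made
// for illustration only.
class CrawlUrlFilters
{
    /** @var array Collected filter options, keyed by API parameter name */
    private $options = [];

    /** Limit crawling to URLs containing any of the given strings. */
    public function setUrlCrawlPatterns(array $patterns = null)
    {
        $this->options['urlCrawlPattern'] = $patterns;

        return $this;
    }

    /** Limit crawling to URLs matching the given regular expression. */
    public function setUrlCrawlRegEx($regex)
    {
        $this->options['urlCrawlRegEx'] = $regex;

        return $this;
    }

    /** Options for Crawl to merge into the query string it builds. */
    public function toArray()
    {
        return array_filter($this->options, function ($value) {
            return $value !== null;
        });
    }
}
```

Crawl would then hold an instance of this component and either delegate the setters to it or expose it directly, shrinking its own surface.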
The `Crawl` class skeleton (method bodies omitted):

```php
<?php

class Crawl
{
    use DiffbotAware;

    /** @var string API URL to which to send the request */
    protected $apiUrl = 'https://api.diffbot.com/v3/crawl';

    /** @var string */
    protected $name;

    /** @var Api API which should be used to process the pages */
    protected $api;

    /** @var array Options to set while initiating the API call */
    protected $otherOptions = [];

    /** @var array Array of seed URLs to crawl */
    protected $seeds = [];

    /**
     * @see getName
     * @param string|null $name
     * @param null|Api $api
     */
    public function __construct($name = null, Api $api = null)

    /**
     * Returns the unique name of the crawljob.
     * This name is later used to download datasets, or to modify the job.
     *
     * @return string
     */
    public function getName()

    /**
     * API which should be used to process the pages
     *
     * Accepts a fully formed instance of any other API. Will use it to build
     * and auto-encode the URL. To satisfy the required $url param of the API
     * classes, use the string 'crawl' which prepares the API for Crawlbot
     * consumption internally.
     *
     * @see https://www.diffbot.com/dev/docs/crawl/api.jsp ApiUrl docs
     * @param Api $api
     * @return $this
     */
    public function setApi(Api $api)

    /**
     * An array of URLs (seeds) to crawl for matching links
     *
     * By default Crawlbot will restrict spidering to the entire domain
     * ("http://blog.diffbot.com" will include URLs at "http://www.diffbot.com").
     *
     * @param array $seeds
     * @return $this
     */
    public function setSeeds(array $seeds)

    /**
     * Array of strings to limit pages crawled to those whose URLs
     * contain any of the content strings.
     *
     * You can use the exclamation point to specify a negative string, e.g.
     * !product to exclude URLs containing the string "product", and the ^ and
     * $ characters to limit matches to the beginning or end of the URL.
     *
     * The use of a urlCrawlPattern will allow Crawlbot to spider outside of
     * the seed domain; it will follow all matching URLs regardless of domain.
     *
     * @param array $pattern
     * @return $this
     */
    public function setUrlCrawlPatterns(array $pattern = null)

    /**
     * Specify a regular expression to limit pages crawled to those URLs that
     * match your expression. This will override any urlCrawlPattern value.
     *
     * The use of a urlCrawlRegEx will allow Crawlbot to spider outside of the
     * seed domain; it will follow all matching URLs regardless of domain.
     *
     * @param string $regex
     * @return $this
     */
    public function setUrlCrawlRegEx($regex)

    /**
     * Specify ||-separated strings to limit pages processed to those whose
     * URLs contain any of the content strings.
     *
     * You can use the exclamation point to specify a negative string, e.g.
     * !/category to exclude URLs containing the string "/category", and the ^
     * and $ characters to limit matches to the beginning or end of the URL.
     *
     * @param array $pattern
     * @return $this
     */
    public function setUrlProcessPatterns(array $pattern = null)

    /**
     * Specify a regular expression to limit pages processed to those URLs that
     * match your expression. This will override any urlProcessPattern value.
     *
     * @param string $regex
     * @return $this
     */
    public function setUrlProcessRegEx($regex)

    /**
     * Specify ||-separated strings to limit pages processed to those whose
     * HTML contains any of the content strings.
     *
     * @param array $pattern
     * @return $this
     */
    public function setPageProcessPatterns(array $pattern)

    /**
     * Specify the depth of your crawl. A maxHops=0 will limit processing to
     * the seed URL(s) only -- no other links will be processed; maxHops=1 will
     * process all (otherwise matching) pages whose links appear on seed URL(s);
     * maxHops=2 will process pages whose links appear on those pages; and so on.
     *
     * By default, Crawlbot will crawl and process links at any depth.
     *
     * @param int $input
     * @return $this
     */
    public function setMaxHops($input = -1)

    /**
     * Specify max pages to spider. Default: 100,000.
     *
     * @param int $input
     * @return $this
     */
    public function setMaxToCrawl($input = 100000)

    /**
     * Specify max pages to process through Diffbot APIs. Default: 100,000.
     *
     * @param int $input
     * @return $this
     */
    public function setMaxToProcess($input = 100000)

    /**
     * If input is an email address, send a message to this email address when
     * the crawl hits the maxToCrawl or maxToProcess limit, or when the crawl
     * completes.
     *
     * If input is a URL, you will receive a POST with X-Crawl-Name and
     * X-Crawl-Status in the headers, and the full JSON response in the
     * POST body.
     *
     * @param string $string
     * @return $this
     * @throws InvalidArgumentException
     */
    public function notify($string)

    /**
     * Wait this many seconds between each URL crawled from a single IP address.
     * Specify the number of seconds as an integer or floating-point number.
     *
     * @param float $input
     * @return $this
     * @throws InvalidArgumentException
     */
    public function setCrawlDelay($input = 0.25)

    /**
     * Specify the number of days as a floating-point (e.g. repeat=7.0) to
     * repeat this crawl. By default crawls will not be repeated.
     *
     * @param int|float $input
     * @return $this
     * @throws \InvalidArgumentException
     */
    public function setRepeat($input)

    /**
     * By default repeat crawls will only process new (previously unprocessed)
     * pages. Set to 0 to process all content on repeat crawls.
     *
     * @param int $int
     * @return $this
     */
    public function setOnlyProcessIfNew($int = 1)

    /**
     * Specify the maximum number of crawl repeats. By default (maxRounds=0)
     * repeating crawls will continue indefinitely.
     *
     * @param int $input
     * @return $this
     */
    public function setMaxRounds($input = 0)

    /**
     * Ignores robots.txt if set to 0/false.
     *
     * @param bool $bool
     * @return $this
     */
    public function setObeyRobots($bool = true)

    /**
     * Set value to 1 to force the use of proxy IPs for the crawl.
     *
     * @param bool $bool
     * @return $this
     */
    public function setUseProxies($bool = true)

    /**
     * Force the start of a new crawl "round" (manually repeat the crawl).
     * If onlyProcessIfNew is set to 1 (default), only newly-created pages will
     * be processed.
     *
     * @param bool $commit
     * @return EntityIterator
     * @throws DiffbotException
     */
    public function roundStart($commit = true)

    /**
     * Pause a crawl.
     *
     * @param bool $commit
     * @return EntityIterator
     * @throws DiffbotException
     */
    public function pause($commit = true)

    /**
     * Unpause (resume) a paused crawl.
     *
     * @param bool $commit
     * @return EntityIterator
     * @throws DiffbotException
     */
    public function unpause($commit = true)

    /**
     * Restart removes all crawled data while maintaining crawl settings.
     *
     * @param bool $commit
     * @return EntityIterator
     * @throws DiffbotException
     */
    public function restart($commit = true)

    /**
     * Delete a crawl, and all associated data, completely.
     *
     * @param bool $commit
     * @return EntityIterator
     * @throws DiffbotException
     */
    public function delete($commit = true)

    public function call()

    /**
     * Builds out the URL string that gets requested once `call()` is called.
     *
     * @return string
     */
    public function buildUrl()

    /**
     * Sets the request type to "urls" to retrieve the URL Report URL,
     * used to understand diagnostic data of URLs.
     *
     * @return $this
     */
    public function getUrlReportUrl($num = null)

    /**
     * @return string
     */
    protected function getApiString()
}
```
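Following the Extract Interface suggestion above: callers that only start, pause, or delete crawl jobs do not need the many configuration setters. A sketch of such a narrowed interface, under the assumption that the lifecycle methods form one usage cluster (the name `CrawlJobControl` is hypothetical; the signatures mirror the skeleton above):

```php
<?php

// Hypothetical interface for consumers that only control the lifecycle of a
// crawl job; the method signatures are copied from the corresponding methods
// on Crawl shown above.
interface CrawlJobControl
{
    /** Force the start of a new crawl round. @return EntityIterator */
    public function roundStart($commit = true);

    /** Pause the crawl. @return EntityIterator */
    public function pause($commit = true);

    /** Resume a paused crawl. @return EntityIterator */
    public function unpause($commit = true);

    /** Remove all crawled data while keeping the settings. @return EntityIterator */
    public function restart($commit = true);

    /** Delete the crawl and all associated data. @return EntityIterator */
    public function delete($commit = true);
}
```

Crawl would then declare `implements CrawlJobControl`, and classes that only manage job state could type-hint against the interface instead of the full concrete class.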