Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Crawler and, based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 21 | class Crawler |
||
| 22 | { |
||
| 23 | /** @var \GuzzleHttp\Client */ |
||
| 24 | protected $client; |
||
| 25 | |||
| 26 | /** @var \Psr\Http\Message\UriInterface */ |
||
| 27 | protected $baseUrl; |
||
| 28 | |||
| 29 | /** @var \Spatie\Crawler\CrawlObserverCollection */ |
||
| 30 | protected $crawlObservers; |
||
| 31 | |||
| 32 | /** @var \Spatie\Crawler\CrawlProfile */ |
||
| 33 | protected $crawlProfile; |
||
| 34 | |||
| 35 | /** @var int */ |
||
| 36 | protected $concurrency; |
||
| 37 | |||
| 38 | /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */ |
||
| 39 | protected $crawlQueue; |
||
| 40 | |||
| 41 | /** @var int */ |
||
| 42 | protected $crawledUrlCount = 0; |
||
| 43 | |||
| 44 | /** @var int|null */ |
||
| 45 | protected $maximumCrawlCount = null; |
||
| 46 | |||
| 47 | /** @var int */ |
||
| 48 | protected $maximumResponseSize = 1024 * 1024 * 2; |
||
| 49 | |||
| 50 | /** @var int|null */ |
||
| 51 | protected $maximumDepth = null; |
||
| 52 | |||
| 53 | /** @var int|null */ |
||
| 54 | protected $poolItemLimit = null; |
||
| 55 | |||
| 56 | /** @var bool */ |
||
| 57 | protected $respectRobots = true; |
||
| 58 | |||
| 59 | /** @var \Tree\Node\Node */ |
||
| 60 | protected $depthTree; |
||
| 61 | |||
| 62 | /** @var bool */ |
||
| 63 | protected $executeJavaScript = false; |
||
| 64 | |||
| 65 | /** @var Browsershot */ |
||
| 66 | protected $browsershot = null; |
||
| 67 | |||
| 68 | /** @var \Spatie\Robots\RobotsTxt */ |
||
| 69 | protected $robotsTxt = null; |
||
| 70 | |||
| 71 | /** @var string */ |
||
| 72 | protected $crawlRequestFulfilledClass; |
||
| 73 | |||
| 74 | /** @var string */ |
||
| 75 | protected $crawlRequestFailedClass; |
||
| 76 | |||
| 77 | /** @var float */ |
||
| 78 | protected $delayBetweenRequests = 0.0; |
||
| 79 | |||
| 80 | /** @var array */ |
||
| 81 | protected static $defaultClientOptions = [ |
||
| 82 | RequestOptions::COOKIES => true, |
||
| 83 | RequestOptions::CONNECT_TIMEOUT => 10, |
||
| 84 | RequestOptions::TIMEOUT => 10, |
||
| 85 | RequestOptions::ALLOW_REDIRECTS => false, |
||
| 86 | ]; |
||
| 87 | |||
| 88 | /** @var array|null */ |
||
| 89 | protected $proxyConfig = null; |
||
| 90 | |||
| 91 | /** @var bool */ |
||
| 92 | protected $usingProxies = false; |
||
| 93 | |||
| 94 | public static function create(array $clientOptions = []): Crawler |
||
| 104 | |||
| 105 | public function __construct(Client $client, int $concurrency = 10) |
||
| 121 | |||
| 122 | public function setConcurrency(int $concurrency): Crawler |
||
| 128 | |||
| 129 | public function setMaximumResponseSize(int $maximumResponseSizeInBytes): Crawler |
||
| 135 | |||
| 136 | public function getMaximumResponseSize(): ?int |
||
| 140 | |||
| 141 | public function setMaximumCrawlCount(int $maximumCrawlCount): Crawler |
||
| 147 | |||
| 148 | public function getMaximumCrawlCount(): ?int |
||
| 152 | |||
| 153 | public function getCrawlerUrlCount(): int |
||
| 157 | |||
| 158 | public function setMaximumDepth(int $maximumDepth): Crawler |
||
| 164 | |||
| 165 | public function getPoolItemLimit(): ?int |
||
| 169 | |||
| 170 | public function setPoolItemLimit(int $poolItemLimit): Crawler |
||
| 176 | |||
| 177 | public function getMaximumDepth(): ?int |
||
| 181 | |||
| 182 | public function setDelayBetweenRequests(int $delay): Crawler |
||
| 188 | |||
| 189 | public function getDelayBetweenRequests(): float |
||
| 193 | |||
| 194 | public function ignoreRobots(): Crawler |
||
| 200 | |||
| 201 | public function respectRobots(): Crawler |
||
| 207 | |||
| 208 | public function mustRespectRobots(): bool |
||
| 212 | |||
| 213 | public function getRobotsTxt(): RobotsTxt |
||
| 217 | |||
| 218 | public function setCrawlQueue(CrawlQueue $crawlQueue): Crawler |
||
| 224 | |||
| 225 | public function getCrawlQueue(): CrawlQueue |
||
| 229 | |||
| 230 | public function executeJavaScript(): Crawler |
||
| 236 | |||
| 237 | public function doNotExecuteJavaScript(): Crawler |
||
| 243 | |||
| 244 | public function mayExecuteJavascript(): bool |
||
| 248 | |||
| 249 | /** |
||
| 250 | * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers |
||
| 251 | * |
||
| 252 | * @return $this |
||
| 253 | */ |
||
| 254 | public function setCrawlObserver($crawlObservers): Crawler |
||
| 262 | |||
| 263 | public function setCrawlObservers(array $crawlObservers): Crawler |
||
| 269 | |||
| 270 | public function addCrawlObserver(CrawlObserver $crawlObserver): Crawler |
||
| 276 | |||
| 277 | public function getCrawlObservers(): CrawlObserverCollection |
||
| 281 | |||
| 282 | public function setCrawlProfile(CrawlProfile $crawlProfile): Crawler |
||
| 288 | |||
| 289 | public function getCrawlProfile(): CrawlProfile |
||
| 293 | |||
| 294 | public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): Crawler |
||
| 306 | |||
| 307 | public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): Crawler |
||
| 319 | |||
| 320 | public function setBrowsershot(Browsershot $browsershot) |
||
| 326 | |||
| 327 | public function getBrowsershot(): Browsershot |
||
| 335 | |||
| 336 | public function getBaseUrl(): UriInterface |
||
| 340 | |||
| 341 | public function setProxies(array $proxyConfig): Crawler |
||
| 348 | |||
| 349 | /** |
||
| 350 | * @param \Psr\Http\Message\UriInterface|string $baseUrl |
||
| 351 | */ |
||
| 352 | public function startCrawling($baseUrl) |
||
| 386 | |||
| 387 | public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node |
||
| 415 | |||
| 416 | protected function startCrawlingQueue() |
||
| 431 | |||
| 432 | protected function getConfig() |
||
| 433 | { |
||
| 434 | $config = $this->client->getConfig(); |
||
| 435 | if ($this->usingProxies) { |
||
| 436 | $config['proxy'] = $this->getProxyConfig(); |
||
| 437 | } |
||
| 438 | |||
| 439 | return $config; |
||
| 440 | } |
||
| 441 | |||
| 442 | protected function getProxyConfig() |
||
| 443 | { |
||
| 444 | $ips = collect($this->proxyConfig['ips']); |
||
| 445 | $username = $this->proxyConfig['username']; |
||
| 446 | $password = $this->proxyConfig['password']; |
||
| 447 | $port = $this->proxyConfig['port']; |
||
| 448 | $proxyIp = $ips->random(); |
||
| 449 | |||
| 450 | return "http://{$username}:{$password}@{$proxyIp}:{$port}"; |
||
| 451 | } |
||
| 452 | |||
| 453 | /** |
||
| 454 | * @deprecated This function will be removed in the next major version |
||
| 455 | */ |
||
| 456 | public function endsWith($haystack, $needle) |
||
| 461 | |||
| 462 | protected function createRobotsTxt(UriInterface $uri): RobotsTxt |
||
| 466 | |||
| 467 | protected function getCrawlRequests(): Generator |
||
| 497 | |||
| 498 | public function addToCrawlQueue(CrawlUrl $crawlUrl): Crawler |
||
| 514 | |||
| 515 | public function maximumCrawlCountReached(): bool |
||
| 525 | } |
||
| 526 |
This check looks for assignments to scalar types that may be of the wrong type.
To ensure the code behaves as expected, it may be a good idea to add an explicit type cast.