Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Crawler and, based on these observations, apply Extract Interface, too.
| 1 | <?php  | 
            ||
| 24 | class Crawler  | 
            ||
| 25 | { | 
            ||
| 26 | /** @var \GuzzleHttp\Client */  | 
            ||
| 27 | protected $client;  | 
            ||
| 28 | |||
| 29 | /** @var \Psr\Http\Message\UriInterface */  | 
            ||
| 30 | protected $baseUrl;  | 
            ||
| 31 | |||
| 32 | /** @var \Spatie\Crawler\CrawlObserver[] */  | 
            ||
| 33 | protected $crawlObservers;  | 
            ||
| 34 | |||
| 35 | /** @var \Spatie\Crawler\CrawlProfile */  | 
            ||
| 36 | protected $crawlProfile;  | 
            ||
| 37 | |||
| 38 | /** @var int */  | 
            ||
| 39 | protected $concurrency;  | 
            ||
| 40 | |||
| 41 | /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */  | 
            ||
| 42 | protected $crawlQueue;  | 
            ||
| 43 | |||
| 44 | /** @var int */  | 
            ||
| 45 | protected $crawledUrlCount = 0;  | 
            ||
| 46 | |||
| 47 | /** @var int|null */  | 
            ||
| 48 | protected $maximumCrawlCount = null;  | 
            ||
| 49 | |||
| 50 | /** @var int */  | 
            ||
| 51 | protected $maximumResponseSize = 1024 * 1024 * 2;  | 
            ||
| 52 | |||
| 53 | /** @var int|null */  | 
            ||
| 54 | protected $maximumDepth = null;  | 
            ||
| 55 | |||
| 56 | /** @var \Tree\Node\Node */  | 
            ||
| 57 | protected $depthTree;  | 
            ||
| 58 | |||
| 59 | /** @var bool */  | 
            ||
| 60 | protected $executeJavaScript = false;  | 
            ||
| 61 | |||
| 62 | /** @var Browsershot */  | 
            ||
| 63 | protected $browsershot = null;  | 
            ||
| 64 | |||
| 65 | protected static $defaultClientOptions = [  | 
            ||
| 66 | RequestOptions::COOKIES => true,  | 
            ||
| 67 | RequestOptions::CONNECT_TIMEOUT => 10,  | 
            ||
| 68 | RequestOptions::TIMEOUT => 10,  | 
            ||
| 69 | RequestOptions::ALLOW_REDIRECTS => false,  | 
            ||
| 70 | ];  | 
            ||
| 71 | |||
| 72 | /**  | 
            ||
| 73 | * @param array $clientOptions  | 
            ||
| 74 | *  | 
            ||
| 75 | * @return static  | 
            ||
| 76 | */  | 
            ||
| 77 | public static function create(array $clientOptions = [])  | 
            ||
| 87 | |||
| 88 | public function __construct(Client $client, int $concurrency = 10)  | 
            ||
| 98 | |||
| 99 | /**  | 
            ||
| 100 | * @param int $concurrency  | 
            ||
| 101 | *  | 
            ||
| 102 | * @return $this  | 
            ||
| 103 | */  | 
            ||
| 104 | public function setConcurrency(int $concurrency)  | 
            ||
| 110 | |||
| 111 | /**  | 
            ||
| 112 | * Responses that are larger than the specified value will be ignored.  | 
            ||
| 113 | *  | 
            ||
| 114 | * @param int $maximumResponseSizeInBytes  | 
            ||
| 115 | *  | 
            ||
| 116 | * @return $this  | 
            ||
| 117 | */  | 
            ||
| 118 | public function setMaximumResponseSize(int $maximumResponseSizeInBytes)  | 
            ||
| 124 | |||
| 125 | /**  | 
            ||
| 126 | * @param int $maximumCrawlCount  | 
            ||
| 127 | *  | 
            ||
| 128 | * @return $this  | 
            ||
| 129 | */  | 
            ||
| 130 | public function setMaximumCrawlCount(int $maximumCrawlCount)  | 
            ||
| 136 | |||
| 137 | /**  | 
            ||
| 138 | * @param int $maximumDepth  | 
            ||
| 139 | *  | 
            ||
| 140 | * @return $this  | 
            ||
| 141 | */  | 
            ||
| 142 | public function setMaximumDepth(int $maximumDepth)  | 
            ||
| 148 | |||
| 149 | /**  | 
            ||
| 150 | * @param CrawlQueue $crawlQueue  | 
            ||
| 151 | * @return $this  | 
            ||
| 152 | */  | 
            ||
| 153 | public function setCrawlQueue(CrawlQueue $crawlQueue)  | 
            ||
| 159 | |||
| 160 | /**  | 
            ||
| 161 | * @return $this  | 
            ||
| 162 | */  | 
            ||
| 163 | public function executeJavaScript()  | 
            ||
| 169 | |||
| 170 | /**  | 
            ||
| 171 | * @return $this  | 
            ||
| 172 | */  | 
            ||
| 173 | public function doNotExecuteJavaScript()  | 
            ||
| 179 | |||
| 180 | /**  | 
            ||
| 181 | * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers  | 
            ||
| 
                                                                                                    
                        
                         | 
                |||
| 182 | *  | 
            ||
| 183 | * @return $this  | 
            ||
| 184 | */  | 
            ||
| 185 | public function setCrawlObserver($crawlObservers)  | 
            ||
| 193 | |||
| 194 | public function setCrawlObservers(array $crawlObservers)  | 
            ||
| 200 | |||
| 201 | public function addCrawlObserver(CrawlObserver $crawlObserver)  | 
            ||
| 207 | |||
| 208 | /**  | 
            ||
| 209 | * @param \Spatie\Crawler\CrawlProfile $crawlProfile  | 
            ||
| 210 | *  | 
            ||
| 211 | * @return $this  | 
            ||
| 212 | */  | 
            ||
| 213 | public function setCrawlProfile(CrawlProfile $crawlProfile)  | 
            ||
| 219 | |||
| 220 | /**  | 
            ||
| 221 | * @param \Psr\Http\Message\UriInterface|string $baseUrl  | 
            ||
| 222 | */  | 
            ||
| 223 | public function startCrawling($baseUrl)  | 
            ||
| 251 | |||
| 252 | protected function startCrawlingQueue()  | 
            ||
| 288 | |||
| 289 | public function endsWith($haystack, $needle)  | 
            ||
| 294 | |||
| 295 | protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string  | 
            ||
| 303 | |||
| 304 | /**  | 
            ||
| 305 | * @param ResponseInterface|null $response  | 
            ||
| 306 | * @param CrawlUrl $crawlUrl  | 
            ||
| 307 | */  | 
            ||
| 308 | protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)  | 
            ||
| 318 | |||
| 319 | /**  | 
            ||
| 320 | * @param RequestException $exception  | 
            ||
| 321 | * @param CrawlUrl $crawlUrl  | 
            ||
| 322 | */  | 
            ||
| 323 | protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl)  | 
            ||
| 333 | |||
| 334 | protected function getCrawlRequests(): Generator  | 
            ||
| 355 | |||
| 356 | protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl)  | 
            ||
| 393 | |||
| 394 | protected function shouldCrawl(Node $node): bool  | 
            ||
| 402 | |||
| 403 | protected function extractAllLinks(string $html, UriInterface $foundOnUrl): Collection  | 
            ||
| 421 | |||
| 422 | protected function normalizeUrl(UriInterface $url): UriInterface  | 
            ||
| 426 | |||
| 427 | protected function hasCrawlableScheme(UriInterface $uri): bool  | 
            ||
| 431 | |||
| 432 | protected function addtoDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl)  | 
            ||
| 454 | |||
| 455 | protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string  | 
            ||
| 463 | |||
| 464 | protected function getBrowsershot(): Browsershot  | 
            ||
| 474 | |||
| 475 | public function setBrowsershot(Browsershot $browsershot)  | 
            ||
| 481 | |||
| 482 | protected function addToCrawlQueue(CrawlUrl $crawlUrl)  | 
            ||
| 490 | |||
| 491 | protected function maximumCrawlCountReached(): bool  | 
            ||
| 499 | }  | 
            ||
| 500 | 
This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.