Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Crawler and, based on these observations, to apply Extract Interface as well.
| 1 | <?php |
||
| 26 | class Crawler |
||
| 27 | { |
||
| 28 | /** @var \GuzzleHttp\Client */ |
||
| 29 | protected $client; |
||
| 30 | |||
| 31 | /** @var \Psr\Http\Message\UriInterface */ |
||
| 32 | protected $baseUrl; |
||
| 33 | |||
| 34 | /** @var \Spatie\Crawler\CrawlObserver[] */ |
||
| 35 | protected $crawlObservers; |
||
| 36 | |||
| 37 | /** @var \Spatie\Crawler\CrawlProfile */ |
||
| 38 | protected $crawlProfile; |
||
| 39 | |||
| 40 | /** @var int */ |
||
| 41 | protected $concurrency; |
||
| 42 | |||
| 43 | /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */ |
||
| 44 | protected $crawlQueue; |
||
| 45 | |||
| 46 | /** @var int */ |
||
| 47 | protected $crawledUrlCount = 0; |
||
| 48 | |||
| 49 | /** @var int|null */ |
||
| 50 | protected $maximumCrawlCount = null; |
||
| 51 | |||
| 52 | /** @var int */ |
||
| 53 | protected $maximumResponseSize = 1024 * 1024 * 2; |
||
| 54 | |||
| 55 | /** @var int|null */ |
||
| 56 | protected $maximumDepth = null; |
||
| 57 | |||
| 58 | /** @var bool */ |
||
| 59 | protected $ignoreRobots = false; |
||
| 60 | |||
| 61 | /** @var \Tree\Node\Node */ |
||
| 62 | protected $depthTree; |
||
| 63 | |||
| 64 | /** @var bool */ |
||
| 65 | protected $executeJavaScript = false; |
||
| 66 | |||
| 67 | /** @var Browsershot */ |
||
| 68 | protected $browsershot = null; |
||
| 69 | |||
| 70 | /** @var \Spatie\Robots\RobotsTxt */ |
||
| 71 | protected $robotsTxt = null; |
||
| 72 | |||
| 73 | protected static $defaultClientOptions = [ |
||
| 74 | RequestOptions::COOKIES => true, |
||
| 75 | RequestOptions::CONNECT_TIMEOUT => 10, |
||
| 76 | RequestOptions::TIMEOUT => 10, |
||
| 77 | RequestOptions::ALLOW_REDIRECTS => false, |
||
| 78 | ]; |
||
| 79 | |||
| 80 | /** |
||
| 81 | * @param array $clientOptions |
||
| 82 | * |
||
| 83 | * @return static |
||
| 84 | */ |
||
| 85 | public static function create(array $clientOptions = []) |
||
| 95 | |||
| 96 | public function __construct(Client $client, int $concurrency = 10) |
||
| 106 | |||
| 107 | /** |
||
| 108 | * @param int $concurrency |
||
| 109 | * |
||
| 110 | * @return $this |
||
| 111 | */ |
||
| 112 | public function setConcurrency(int $concurrency) |
||
| 118 | |||
| 119 | /** |
||
| 120 | * Responses that are larger than the specified value will be ignored. |
||
| 121 | * |
||
| 122 | * @param int $maximumResponseSizeInBytes |
||
| 123 | * |
||
| 124 | * @return $this |
||
| 125 | */ |
||
| 126 | public function setMaximumResponseSize(int $maximumResponseSizeInBytes) |
||
| 132 | |||
| 133 | /** |
||
| 134 | * @param int $maximumCrawlCount |
||
| 135 | * |
||
| 136 | * @return $this |
||
| 137 | */ |
||
| 138 | public function setMaximumCrawlCount(int $maximumCrawlCount) |
||
| 144 | |||
| 145 | /** |
||
| 146 | * @param int $maximumDepth |
||
| 147 | * |
||
| 148 | * @return $this |
||
| 149 | */ |
||
| 150 | public function setMaximumDepth(int $maximumDepth) |
||
| 156 | |||
| 157 | /** |
||
| 158 | * @param bool $ignoreRobots |
||
| 159 | * |
||
| 160 | * @return $this |
||
| 161 | */ |
||
| 162 | public function ignoreRobots(bool $ignoreRobots = true) |
||
| 168 | |||
| 169 | /** |
||
| 170 | * @param CrawlQueue $crawlQueue |
||
| 171 | * |
||
| 172 | * @return $this |
||
| 173 | */ |
||
| 174 | public function setCrawlQueue(CrawlQueue $crawlQueue) |
||
| 180 | |||
| 181 | /** |
||
| 182 | * @return $this |
||
| 183 | */ |
||
| 184 | public function executeJavaScript() |
||
| 190 | |||
| 191 | /** |
||
| 192 | * @return $this |
||
| 193 | */ |
||
| 194 | public function doNotExecuteJavaScript() |
||
| 200 | |||
| 201 | /** |
||
| 202 | * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers |
||
|
|
|||
| 203 | * |
||
| 204 | * @return $this |
||
| 205 | */ |
||
| 206 | public function setCrawlObserver($crawlObservers) |
||
| 214 | |||
| 215 | public function setCrawlObservers(array $crawlObservers) |
||
| 221 | |||
| 222 | public function addCrawlObserver(CrawlObserver $crawlObserver) |
||
| 228 | |||
| 229 | /** |
||
| 230 | * @param \Spatie\Crawler\CrawlProfile $crawlProfile |
||
| 231 | * |
||
| 232 | * @return $this |
||
| 233 | */ |
||
| 234 | public function setCrawlProfile(CrawlProfile $crawlProfile) |
||
| 240 | |||
| 241 | /** |
||
| 242 | * @param \Psr\Http\Message\UriInterface|string $baseUrl |
||
| 243 | */ |
||
| 244 | public function startCrawling($baseUrl) |
||
| 274 | |||
| 275 | protected function startCrawlingQueue() |
||
| 324 | |||
| 325 | public function endsWith($haystack, $needle) |
||
| 330 | |||
| 331 | protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string |
||
| 339 | |||
| 340 | protected function createRobotsTxt(UriInterface $uri): RobotsTxt |
||
| 344 | |||
| 345 | /** |
||
| 346 | * @param ResponseInterface|null $response |
||
| 347 | * @param CrawlUrl $crawlUrl |
||
| 348 | */ |
||
| 349 | protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl) |
||
| 359 | |||
| 360 | /** |
||
| 361 | * @param RequestException $exception |
||
| 362 | * @param CrawlUrl $crawlUrl |
||
| 363 | */ |
||
| 364 | protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl) |
||
| 374 | |||
| 375 | protected function getCrawlRequests(): Generator |
||
| 396 | |||
| 397 | protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl) |
||
| 434 | |||
| 435 | protected function shouldCrawl(Node $node): bool |
||
| 447 | |||
| 448 | /** |
||
| 449 | * @param string $html |
||
| 450 | * @param \Psr\Http\Message\UriInterface $foundOnUrl |
||
| 451 | * |
||
| 452 | * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null |
||
| 453 | */ |
||
| 454 | protected function extractAllLinks(string $html, UriInterface $foundOnUrl) |
||
| 475 | |||
| 476 | protected function normalizeUrl(UriInterface $url): UriInterface |
||
| 480 | |||
| 481 | protected function hasCrawlableScheme(UriInterface $uri): bool |
||
| 485 | |||
| 486 | protected function addtoDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl) |
||
| 508 | |||
| 509 | protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string |
||
| 517 | |||
| 518 | protected function getBrowsershot(): Browsershot |
||
| 528 | |||
| 529 | public function setBrowsershot(Browsershot $browsershot) |
||
| 535 | |||
| 536 | protected function addToCrawlQueue(CrawlUrl $crawlUrl) |
||
| 544 | |||
| 545 | protected function maximumCrawlCountReached(): bool |
||
| 553 | |||
| 554 | protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool |
||
| 562 | |||
| 563 | protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool |
||
| 571 | } |
||
| 572 |
This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.