Duplicate code is one of the most pungent code smells. A common rule of thumb is to restructure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:

- Duplicated code in the same class: apply Extract Method and call the new method from both places (see the sketch after this list).
- Duplicated code in two sibling classes: apply Extract Method in both classes, then Pull Up Method into the common parent; if the code is similar rather than identical, Form Template Method.
- Duplicated code in two unrelated classes: apply Extract Class and let both classes delegate to the new component.
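As an illustration of the first case, here is a minimal before/after sketch in PHP; the report classes, their methods, and the summing logic are hypothetical, invented for the example rather than taken from the code under review:

```php
<?php

// Before: the same summing logic is repeated in two methods.
class DuplicatedReport
{
    public function summary(array $rows): string
    {
        $total = 0;
        foreach ($rows as $row) {
            $total += max(0, (int) $row['amount']); // duplicated logic
        }

        return 'Summary: ' . $total;
    }

    public function details(array $rows): string
    {
        $total = 0;
        foreach ($rows as $row) {
            $total += max(0, (int) $row['amount']); // duplicated logic
        }

        return 'Details: ' . $total . ' over ' . count($rows) . ' rows';
    }
}

// After: Extract Method moves the shared logic to one place.
class DeduplicatedReport
{
    public function summary(array $rows): string
    {
        return 'Summary: ' . $this->totalAmount($rows);
    }

    public function details(array $rows): string
    {
        return 'Details: ' . $this->totalAmount($rows) . ' over ' . count($rows) . ' rows';
    }

    private function totalAmount(array $rows): int
    {
        $total = 0;
        foreach ($rows as $row) {
            $total += max(0, (int) $row['amount']);
        }

        return $total;
    }
}
```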
Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within the class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster. A sketch of Extract Class applied to Crawler's robots handling follows below.
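In the listing further down, the robots-related members ($respectRobots, $robotsTxt, createRobotsTxt(), mayIndex(), mayFollow()) share a common theme and form one plausible candidate component. Here is a minimal Extract Class sketch; the RobotsPolicy class name is hypothetical, and since the real method bodies are collapsed in the listing, the mayIndex()/mayFollow() logic shown is an assumption:

```php
<?php

use Spatie\Robots\RobotsHeaders;
use Spatie\Robots\RobotsMeta;

// Hypothetical extracted class: the robots-related state and checks
// move off Crawler into one cohesive component.
class RobotsPolicy
{
    /** @var bool */
    protected $respectRobots = true;

    public function ignore(): self
    {
        $this->respectRobots = false;

        return $this;
    }

    public function respect(): self
    {
        $this->respectRobots = true;

        return $this;
    }

    // Assumed logic: the original bodies are collapsed in the listing,
    // but plausibly combine the header and meta verdicts like this.
    public function mayIndex(RobotsHeaders $headers, RobotsMeta $meta): bool
    {
        return ! $this->respectRobots || ($headers->mayIndex() && $meta->mayIndex());
    }

    public function mayFollow(RobotsHeaders $headers, RobotsMeta $meta): bool
    {
        return ! $this->respectRobots || ($headers->mayFollow() && $meta->mayFollow());
    }
}
```

Crawler would then keep a single $robotsPolicy field and delegate to it: its public respectRobots()/ignoreRobots() methods become one-line forwards, and the class sheds several members at once.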
While breaking up the class, it is also a good idea to analyze how other classes use Crawler and, based on these observations, apply Extract Interface as well; a sketch follows.
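If most callers only touch the fluent configuration methods and startCrawling(), the extracted interface might look like the following; the CrawlerInterface name is hypothetical, and only a few representative methods from the listing are shown:

```php
<?php

use Spatie\Crawler\CrawlObserver;
use Spatie\Crawler\CrawlProfile;

// Hypothetical interface capturing how typical callers use Crawler.
interface CrawlerInterface
{
    /** @return $this */
    public function setConcurrency(int $concurrency);

    /** @return $this */
    public function setCrawlProfile(CrawlProfile $crawlProfile);

    /** @return $this */
    public function addCrawlObserver(CrawlObserver $crawlObserver);

    /** @param \Psr\Http\Message\UriInterface|string $baseUrl */
    public function startCrawling($baseUrl);
}
```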
The outline of Crawler, as shown by the inspection (method bodies collapsed):

```php
<?php

// ... (namespace and use statements collapsed) ...

class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\CrawlObserver[] */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var Browsershot */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt */
    protected $robotsTxt = null;

    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = []) { /* ... */ }

    public function __construct(Client $client, int $concurrency = 10) { /* ... */ }

    /**
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency) { /* ... */ }

    /**
     * Responses that are larger than the specified value will be ignored.
     *
     * @param int $maximumResponseSizeInBytes
     *
     * @return $this
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes) { /* ... */ }

    /**
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount) { /* ... */ }

    /**
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth) { /* ... */ }

    /** @return $this */
    public function ignoreRobots() { /* ... */ }

    /** @return $this */
    public function respectRobots() { /* ... */ }

    /**
     * @param CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue) { /* ... */ }

    /** @return $this */
    public function executeJavaScript() { /* ... */ }

    /** @return $this */
    public function doNotExecuteJavaScript() { /* ... */ }

    /**
     * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers) { /* ... */ }

    public function setCrawlObservers(array $crawlObservers) { /* ... */ }

    public function addCrawlObserver(CrawlObserver $crawlObserver) { /* ... */ }

    /**
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile) { /* ... */ }

    /**
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl) { /* ... */ }

    protected function startCrawlingQueue() { /* ... */ }

    public function endsWith($haystack, $needle) { /* ... */ }

    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string { /* ... */ }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt { /* ... */ }

    /**
     * @param ResponseInterface|null $response
     * @param CrawlUrl $crawlUrl
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl) { /* ... */ }

    /**
     * @param RequestException $exception
     * @param CrawlUrl $crawlUrl
     */
    protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl) { /* ... */ }

    protected function getCrawlRequests(): Generator { /* ... */ }

    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl) { /* ... */ }

    protected function shouldCrawl(Node $node): bool { /* ... */ }

    /**
     * @param string $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl) { /* ... */ }

    protected function normalizeUrl(UriInterface $url): UriInterface { /* ... */ }

    protected function hasCrawlableScheme(UriInterface $uri): bool { /* ... */ }

    protected function addtoDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl) { /* ... */ }

    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string { /* ... */ }

    protected function getBrowsershot(): Browsershot { /* ... */ }

    public function setBrowsershot(Browsershot $browsershot) { /* ... */ }

    protected function addToCrawlQueue(CrawlUrl $crawlUrl) { /* ... */ }

    protected function maximumCrawlCountReached(): bool { /* ... */ }

    // The inspection flags the next two methods as duplicated code.
    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool { /* ... */ }

    protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool { /* ... */ }
}
```
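The two flagged methods, mayIndex() and mayFollow(), are the concrete duplication target here. Their bodies are collapsed above, but assuming each one first honors $respectRobots and then asks RobotsHeaders and RobotsMeta the same question, the repeated skeleton could be folded into one shared helper:

```php
<?php

use Spatie\Robots\RobotsHeaders;
use Spatie\Robots\RobotsMeta;

// Self-contained sketch; in practice these methods would stay on Crawler
// (or move to the RobotsPolicy component sketched earlier).
class RobotsChecks
{
    /** @var bool */
    protected $respectRobots = true;

    public function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        return $this->robotsAllow([$robotsHeaders, 'mayIndex'], [$robotsMeta, 'mayIndex']);
    }

    public function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        return $this->robotsAllow([$robotsHeaders, 'mayFollow'], [$robotsMeta, 'mayFollow']);
    }

    // The guard-plus-checks skeleton that both methods are assumed to repeat:
    // allow everything when robots are ignored, otherwise require every
    // individual check to pass.
    protected function robotsAllow(callable ...$checks): bool
    {
        if (! $this->respectRobots) {
            return true;
        }

        foreach ($checks as $check) {
            if (! $check()) {
                return false;
            }
        }

        return true;
    }
}
```

If the Extract Class step from the earlier sketch is applied as well, this helper naturally lands on the new robots component rather than on Crawler itself.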