Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Crawler and, based on these observations, to apply Extract Interface as well.
| 1 | <?php |
||
| 22 | class Crawler |
||
| 23 | { |
||
| 24 | /** @var \GuzzleHttp\Client */ |
||
| 25 | protected $client; |
||
| 26 | |||
| 27 | /** @var \Psr\Http\Message\UriInterface */ |
||
| 28 | protected $baseUrl; |
||
| 29 | |||
| 30 | /** @var \Spatie\Crawler\CrawlObserver */ |
||
| 31 | protected $crawlObserver; |
||
| 32 | |||
| 33 | /** @var \Spatie\Crawler\CrawlProfile */ |
||
| 34 | protected $crawlProfile; |
||
| 35 | |||
| 36 | /** @var int */ |
||
| 37 | protected $concurrency; |
||
| 38 | |||
| 39 | /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */ |
||
| 40 | protected $crawlQueue; |
||
| 41 | |||
| 42 | /** @var int */ |
||
| 43 | protected $crawledUrlCount = 0; |
||
| 44 | |||
| 45 | /** @var int|null */ |
||
| 46 | protected $maximumCrawlCount = null; |
||
| 47 | |||
| 48 | /** @var int|null */ |
||
| 49 | protected $maximumDepth = null; |
||
| 50 | |||
| 51 | /** @var \Tree\Node\Node */ |
||
| 52 | protected $depthTree; |
||
| 53 | |||
| 54 | /** @var bool */ |
||
| 55 | protected $executeJavaScript = false; |
||
| 56 | |||
| 57 | /** @var Browsershot|null */ |
||
| 58 | protected $browsershot = null; |
||
| 59 | |||
| 60 | protected static $defaultClientOptions = [ |
||
| 61 | RequestOptions::COOKIES => true, |
||
| 62 | RequestOptions::CONNECT_TIMEOUT => 10, |
||
| 63 | RequestOptions::TIMEOUT => 10, |
||
| 64 | RequestOptions::ALLOW_REDIRECTS => false, |
||
| 65 | ]; |
||
| 66 | |||
| 67 | /** |
||
| 68 | * @param array $clientOptions |
||
| 69 | * |
||
| 70 | * @return static |
||
| 71 | */ |
||
| 72 | public static function create(array $clientOptions = []) |
||
| 73 | { |
||
| 74 | $clientOptions = (count($clientOptions)) |
||
| 75 | ? $clientOptions |
||
| 76 | : self::$defaultClientOptions; |
||
| 77 | |||
| 78 | $client = new Client($clientOptions); |
||
| 79 | |||
| 80 | return new static($client); |
||
| 81 | } |
||
| 82 | |||
| 83 | public function __construct(Client $client, int $concurrency = 10) |
||
| 84 | { |
||
| 85 | $this->client = $client; |
||
| 86 | |||
| 87 | $this->concurrency = $concurrency; |
||
| 88 | |||
| 89 | $this->crawlProfile = new CrawlAllUrls(); |
||
| 90 | |||
| 91 | $this->crawlQueue = new CollectionCrawlQueue(); |
||
| 92 | } |
||
| 93 | |||
| 94 | /** |
||
| 95 | * @param int $concurrency |
||
| 96 | * |
||
| 97 | * @return $this |
||
| 98 | */ |
||
| 99 | public function setConcurrency(int $concurrency) |
||
| 100 | { |
||
| 101 | $this->concurrency = $concurrency; |
||
| 102 | |||
| 103 | return $this; |
||
| 104 | } |
||
| 105 | |||
| 106 | /** |
||
| 107 | * @param int $maximumCrawlCount |
||
| 108 | * |
||
| 109 | * @return $this |
||
| 110 | */ |
||
| 111 | public function setMaximumCrawlCount(int $maximumCrawlCount) |
||
| 112 | { |
||
| 113 | $this->maximumCrawlCount = $maximumCrawlCount; |
||
| 114 | |||
| 115 | return $this; |
||
| 116 | } |
||
| 117 | |||
| 118 | /** |
||
| 119 | * @param int $maximumDepth |
||
| 120 | * |
||
| 121 | * @return $this |
||
| 122 | */ |
||
| 123 | public function setMaximumDepth(int $maximumDepth) |
||
| 124 | { |
||
| 125 | $this->maximumDepth = $maximumDepth; |
||
| 126 | |||
| 127 | return $this; |
||
| 128 | } |
||
| 129 | |||
| 130 | /** |
||
| 131 | * @param CrawlQueue $crawlQueue |
||
| 132 | * @return $this |
||
| 133 | */ |
||
| 134 | public function setCrawlQueue(CrawlQueue $crawlQueue) |
||
| 135 | { |
||
| 136 | $this->crawlQueue = $crawlQueue; |
||
| 137 | |||
| 138 | return $this; |
||
| 139 | } |
||
| 140 | |||
| 141 | /** |
||
| 142 | * @return $this |
||
| 143 | */ |
||
| 144 | public function executeJavaScript() |
||
| 145 | { |
||
| 146 | $this->executeJavaScript = true; |
||
|
|
|||
| 147 | |||
| 148 | return $this; |
||
| 149 | } |
||
| 150 | |||
| 151 | /** |
||
| 152 | * @return $this |
||
| 153 | */ |
||
| 154 | public function doNotExecuteJavaScript() |
||
| 160 | |||
| 161 | /** |
||
| 162 | * @param \Spatie\Crawler\CrawlObserver $crawlObserver |
||
| 163 | * |
||
| 164 | * @return $this |
||
| 165 | */ |
||
| 166 | public function setCrawlObserver(CrawlObserver $crawlObserver) |
||
| 172 | |||
| 173 | /** |
||
| 174 | * @param \Spatie\Crawler\CrawlProfile $crawlProfile |
||
| 175 | * |
||
| 176 | * @return $this |
||
| 177 | */ |
||
| 178 | public function setCrawlProfile(CrawlProfile $crawlProfile) |
||
| 184 | |||
| 185 | /** |
||
| 186 | * @param \Psr\Http\Message\UriInterface|string $baseUrl |
||
| 187 | */ |
||
| 188 | public function startCrawling($baseUrl) |
||
| 189 | { |
||
| 190 | if (! $baseUrl instanceof UriInterface) { |
||
| 191 | $baseUrl = new Uri($baseUrl); |
||
| 214 | |||
| 215 | protected function startCrawlingQueue() |
||
| 248 | |||
| 249 | /** |
||
| 250 | * @param ResponseInterface|null $response |
||
| 251 | * @param CrawlUrl $crawlUrl |
||
| 252 | */ |
||
| 253 | protected function handleResponse($response, CrawlUrl $crawlUrl) |
||
| 257 | |||
| 258 | protected function getCrawlRequests(): Generator |
||
| 276 | |||
| 277 | protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl) |
||
| 310 | |||
| 311 | protected function shouldCrawl(Node $node): bool |
||
| 319 | |||
| 320 | protected function extractAllLinks(string $html, UriInterface $foundOnUrl): Collection |
||
| 333 | |||
| 334 | protected function normalizeUrl(UriInterface $url): UriInterface |
||
| 338 | |||
| 339 | protected function hasCrawlableScheme(UriInterface $uri): bool |
||
| 343 | |||
| 344 | protected function addtoDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl) |
||
| 366 | |||
| 367 | protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string |
||
| 375 | |||
| 376 | protected function getBrowsershot(): Browsershot |
||
| 386 | |||
| 387 | public function setBrowsershot(Browsershot $browsershot) |
||
| 393 | |||
| 394 | protected function addToCrawlQueue(CrawlUrl $crawlUrl) |
||
| 402 | |||
| 403 | protected function maximumCrawlCountReached(): bool |
||
| 411 | } |
||
| 412 |
This check looks for assignments to scalar types that may be of the wrong type.
To ensure the code behaves as expected, it may be a good idea to add an explicit type cast.