Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Crawler and, based on these observations, to apply Extract Interface as well.
1 | <?php |
||
22 | class Crawler |
||
23 | { |
||
24 | /** @var \GuzzleHttp\Client */ |
||
25 | protected $client; |
||
26 | |||
27 | /** @var \Psr\Http\Message\UriInterface */ |
||
28 | protected $baseUrl; |
||
29 | |||
30 | /** @var \Spatie\Crawler\CrawlObserverCollection */ |
||
31 | protected $crawlObservers; |
||
32 | |||
33 | /** @var \Spatie\Crawler\CrawlProfile */ |
||
34 | protected $crawlProfile; |
||
35 | |||
36 | /** @var RetryProfile */ |
||
37 | protected $retryProfile; |
||
38 | |||
39 | /** @var int */ |
||
40 | protected $concurrency; |
||
41 | |||
42 | /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */ |
||
43 | protected $crawlQueue; |
||
44 | |||
45 | /** @var int */ |
||
46 | protected $crawledUrlCount = 0; |
||
47 | |||
48 | /** @var int|null */ |
||
49 | protected $maximumCrawlCount = null; |
||
50 | |||
51 | /** @var int */ |
||
52 | protected $maximumResponseSize = 1024 * 1024 * 2; |
||
53 | |||
54 | /** @var int|null */ |
||
55 | protected $maximumDepth = null; |
||
56 | |||
57 | /** @var bool */ |
||
58 | protected $respectRobots = true; |
||
59 | |||
60 | /** @var \Tree\Node\Node */ |
||
61 | protected $depthTree; |
||
62 | |||
63 | /** @var bool */ |
||
64 | protected $executeJavaScript = false; |
||
65 | |||
66 | /** @var Browsershot */ |
||
67 | protected $browsershot = null; |
||
68 | |||
69 | /** @var \Spatie\Robots\RobotsTxt */ |
||
70 | protected $robotsTxt = null; |
||
71 | |||
72 | /** @var string */ |
||
73 | protected $crawlRequestFulfilledClass; |
||
74 | |||
75 | /** @var string */ |
||
76 | protected $crawlRequestFailedClass; |
||
77 | |||
78 | /** @var float */ |
||
79 | protected $delayBetweenRequests = 0; |
||
80 | |||
81 | /** @var */ |
||
82 | protected static $defaultClientOptions = [ |
||
83 | RequestOptions::COOKIES => true, |
||
84 | RequestOptions::CONNECT_TIMEOUT => 10, |
||
85 | RequestOptions::TIMEOUT => 10, |
||
86 | RequestOptions::ALLOW_REDIRECTS => false, |
||
87 | ]; |
||
88 | |||
89 | public static function create(array $clientOptions = []): Crawler |
||
99 | |||
100 | public function __construct(Client $client, int $concurrency = 10) |
||
118 | |||
119 | public function setConcurrency(int $concurrency): Crawler |
||
125 | |||
126 | public function setMaximumResponseSize(int $maximumResponseSizeInBytes): Crawler |
||
132 | |||
133 | public function getMaximumResponseSize(): ?int |
||
137 | |||
138 | public function setMaximumCrawlCount(int $maximumCrawlCount): Crawler |
||
144 | |||
145 | public function getMaximumCrawlCount(): ?int |
||
149 | |||
150 | public function getCrawlerUrlCount(): int |
||
154 | |||
155 | public function setMaximumDepth(int $maximumDepth): Crawler |
||
161 | |||
162 | public function getMaximumDepth(): ?int |
||
166 | |||
167 | public function setDelayBetweenRequests(int $delay): Crawler |
||
173 | |||
174 | public function getDelayBetweenRequests(): float |
||
178 | |||
179 | public function ignoreRobots(): Crawler |
||
185 | |||
186 | public function respectRobots(): Crawler |
||
192 | |||
193 | public function mustRespectRobots(): bool |
||
197 | |||
198 | public function getRobotsTxt(): RobotsTxt |
||
202 | |||
203 | public function setCrawlQueue(CrawlQueue $crawlQueue): Crawler |
||
209 | |||
210 | public function getCrawlQueue(): CrawlQueue |
||
214 | |||
215 | public function executeJavaScript(): Crawler |
||
221 | |||
222 | public function doNotExecuteJavaScript(): Crawler |
||
228 | |||
229 | public function mayExecuteJavascript(): bool |
||
233 | |||
234 | /** |
||
235 | * @param \Spatie\Crawler\CrawlObserver|array[\Spatie\Crawler\CrawlObserver] $crawlObservers |
||
236 | * |
||
237 | * @return $this |
||
238 | */ |
||
239 | public function setCrawlObserver($crawlObservers): Crawler |
||
247 | |||
248 | public function setCrawlObservers(array $crawlObservers): Crawler |
||
254 | |||
255 | public function addCrawlObserver(CrawlObserver $crawlObserver): Crawler |
||
261 | |||
262 | public function getCrawlObservers(): CrawlObserverCollection |
||
266 | |||
267 | public function setCrawlProfile(CrawlProfile $crawlProfile): Crawler |
||
273 | |||
274 | public function getCrawlProfile(): CrawlProfile |
||
278 | |||
279 | public function setRetryProfile(RetryProfile $retryProfile): Crawler |
||
285 | |||
286 | public function getRetryProfile(): RetryProfile |
||
290 | |||
291 | public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): Crawler |
||
303 | |||
304 | public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): Crawler |
||
316 | |||
317 | public function setBrowsershot(Browsershot $browsershot) |
||
323 | |||
324 | public function getBrowsershot(): Browsershot |
||
332 | |||
333 | public function getBaseUrl(): UriInterface |
||
337 | |||
338 | /** |
||
339 | * @param \Psr\Http\Message\UriInterface|string $baseUrl |
||
340 | */ |
||
341 | public function startCrawling($baseUrl) |
||
375 | |||
376 | public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node |
||
404 | |||
405 | protected function startCrawlingQueue() |
||
420 | |||
421 | /** |
||
422 | * @deprecated This function will be removed in the next major version |
||
423 | */ |
||
424 | public function endsWith($haystack, $needle) |
||
429 | |||
430 | protected function createRobotsTxt(UriInterface $uri): RobotsTxt |
||
434 | |||
435 | protected function getCrawlRequests(): Generator |
||
457 | |||
458 | public function addToCrawlQueue(CrawlUrl $crawlUrl): Crawler |
||
474 | |||
475 | public function maximumCrawlCountReached(): bool |
||
485 | } |
||
486 |
This check looks for assignments to scalar-typed variables or properties where the assigned value may be of the wrong type.
To ensure the code behaves as expected, it may be a good idea to add an explicit type cast.