Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Crawler, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
21 | class Crawler |
||
22 | { |
||
23 | /** @var \GuzzleHttp\Client */ |
||
24 | protected $client; |
||
25 | |||
26 | /** @var \Psr\Http\Message\UriInterface */ |
||
27 | protected $baseUrl; |
||
28 | |||
29 | /** @var \Spatie\Crawler\CrawlObserverCollection */ |
||
30 | protected $crawlObservers; |
||
31 | |||
32 | /** @var \Spatie\Crawler\CrawlProfile */ |
||
33 | protected $crawlProfile; |
||
34 | |||
35 | /** @var int */ |
||
36 | protected $concurrency; |
||
37 | |||
38 | /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */ |
||
39 | protected $crawlQueue; |
||
40 | |||
41 | /** @var int */ |
||
42 | protected $crawledUrlCount = 0; |
||
43 | |||
44 | /** @var int|null */ |
||
45 | protected $maximumCrawlCount = null; |
||
46 | |||
47 | /** @var int */ |
||
48 | protected $maximumResponseSize = 1024 * 1024 * 2; |
||
49 | |||
50 | /** @var int|null */ |
||
51 | protected $maximumDepth = null; |
||
52 | |||
53 | /** @var int|null */ |
||
54 | protected $poolItemLimit = null; |
||
55 | |||
56 | /** @var bool */ |
||
57 | protected $respectRobots = true; |
||
58 | |||
59 | /** @var \Tree\Node\Node */ |
||
60 | protected $depthTree; |
||
61 | |||
62 | /** @var bool */ |
||
63 | protected $executeJavaScript = false; |
||
64 | |||
65 | /** @var Browsershot */ |
||
66 | protected $browsershot = null; |
||
67 | |||
68 | /** @var \Spatie\Robots\RobotsTxt */ |
||
69 | protected $robotsTxt = null; |
||
70 | |||
71 | /** @var string */ |
||
72 | protected $crawlRequestFulfilledClass; |
||
73 | |||
74 | /** @var string */ |
||
75 | protected $crawlRequestFailedClass; |
||
76 | |||
77 | /** @var float */ |
||
78 | protected $delayBetweenRequests = 0; |
||
79 | |||
80 | /** @var array */ |
||
81 | protected static $defaultClientOptions = [ |
||
82 | RequestOptions::COOKIES => true, |
||
83 | RequestOptions::CONNECT_TIMEOUT => 10, |
||
84 | RequestOptions::TIMEOUT => 10, |
||
85 | RequestOptions::ALLOW_REDIRECTS => false, |
||
86 | ]; |
||
87 | |||
88 | /** @var array */ |
||
89 | protected $proxiesConfig = null; |
||
90 | |||
91 | /** @var bool */ |
||
92 | protected $usingProxies = false; |
||
93 | |||
94 | public static function create(array $clientOptions = []): Crawler |
||
104 | |||
105 | public function __construct(Client $client, int $concurrency = 10) |
||
121 | |||
122 | public function setConcurrency(int $concurrency): Crawler |
||
128 | |||
129 | public function setMaximumResponseSize(int $maximumResponseSizeInBytes): Crawler |
||
135 | |||
136 | public function getMaximumResponseSize(): ?int |
||
140 | |||
141 | public function setMaximumCrawlCount(int $maximumCrawlCount): Crawler |
||
147 | |||
148 | public function getMaximumCrawlCount(): ?int |
||
152 | |||
153 | public function getCrawlerUrlCount(): int |
||
157 | |||
158 | public function setMaximumDepth(int $maximumDepth): Crawler |
||
164 | |||
165 | public function getPoolItemLimit(): ?int |
||
169 | |||
170 | public function setPoolItemLimit(int $poolItemLimit): Crawler |
||
176 | |||
177 | public function getMaximumDepth(): ?int |
||
181 | |||
182 | public function setDelayBetweenRequests(int $delay): Crawler |
||
188 | |||
189 | public function getDelayBetweenRequests(): float |
||
193 | |||
194 | public function ignoreRobots(): Crawler |
||
200 | |||
201 | public function respectRobots(): Crawler |
||
207 | |||
208 | public function mustRespectRobots(): bool |
||
212 | |||
213 | public function getRobotsTxt(): RobotsTxt |
||
217 | |||
218 | public function setCrawlQueue(CrawlQueue $crawlQueue): Crawler |
||
224 | |||
225 | public function getCrawlQueue(): CrawlQueue |
||
229 | |||
230 | public function executeJavaScript(): Crawler |
||
236 | |||
237 | public function doNotExecuteJavaScript(): Crawler |
||
243 | |||
244 | public function mayExecuteJavascript(): bool |
||
248 | |||
249 | /** |
||
250 | * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers |
||
251 | * |
||
252 | * @return $this |
||
253 | */ |
||
254 | public function setCrawlObserver($crawlObservers): Crawler |
||
262 | |||
263 | public function setCrawlObservers(array $crawlObservers): Crawler |
||
269 | |||
270 | public function addCrawlObserver(CrawlObserver $crawlObserver): Crawler |
||
276 | |||
277 | public function getCrawlObservers(): CrawlObserverCollection |
||
281 | |||
282 | public function setCrawlProfile(CrawlProfile $crawlProfile): Crawler |
||
288 | |||
289 | public function getCrawlProfile(): CrawlProfile |
||
293 | |||
294 | public function setCrawlFulfilledHandlerClass(string $crawlRequestFulfilledClass): Crawler |
||
306 | |||
307 | public function setCrawlFailedHandlerClass(string $crawlRequestFailedClass): Crawler |
||
319 | |||
320 | public function setBrowsershot(Browsershot $browsershot) |
||
326 | |||
327 | public function getBrowsershot(): Browsershot |
||
335 | |||
336 | public function getBaseUrl(): UriInterface |
||
340 | |||
341 | public function setProxies(array $proxyConfig): Crawler |
||
348 | |||
349 | /** |
||
350 | * @param \Psr\Http\Message\UriInterface|string $baseUrl |
||
351 | */ |
||
352 | public function startCrawling($baseUrl) |
||
386 | |||
387 | public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node |
||
415 | |||
416 | protected function startCrawlingQueue() |
||
431 | |||
432 | protected function getConfig() |
||
440 | |||
441 | protected function getProxyConfig() |
||
451 | |||
452 | /** |
||
453 | * @deprecated This function will be removed in the next major version |
||
454 | */ |
||
455 | public function endsWith($haystack, $needle) |
||
460 | |||
461 | protected function createRobotsTxt(UriInterface $uri): RobotsTxt |
||
465 | |||
466 | protected function getCrawlRequests(): Generator |
||
496 | |||
497 | public function addToCrawlQueue(CrawlUrl $crawlUrl): Crawler |
||
513 | |||
514 | public function maximumCrawlCountReached(): bool |
||
524 | } |
||
525 |
This check looks for assignments to scalar types that may be of the wrong type.
To ensure the code behaves as expected, it may be a good idea to add an explicit type cast.