Duplicate code is one of the most pungent code smells. A widely used rule of thumb, the "rule of three", is to restructure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions include extracting the repeated code into a shared method (Extract Method) and, when the duplicates live in sibling classes, pulling them up into a common superclass (Pull Up Method); a minimal sketch of the first remedy follows below.
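A minimal, hypothetical sketch of Extract Method applied at the rule-of-three threshold (the class and method names here are invented for illustration and do not come from the code under review):

```php
<?php

// Before: the same formatting expression is copy-pasted in three places,
// which is exactly the "three or more" threshold.
class InvoiceRenderer
{
    public function renderLineItem(float $amount): string
    {
        return number_format($amount, 2, '.', ',') . ' EUR';
    }

    public function renderTax(float $amount): string
    {
        return number_format($amount, 2, '.', ',') . ' EUR';
    }

    public function renderTotal(float $amount): string
    {
        return number_format($amount, 2, '.', ',') . ' EUR';
    }
}

// After Extract Method: one canonical implementation, three thin callers.
class InvoiceRendererRefactored
{
    public function renderLineItem(float $amount): string
    {
        return $this->formatMoney($amount);
    }

    public function renderTax(float $amount): string
    {
        return $this->formatMoney($amount);
    }

    public function renderTotal(float $amount): string
    {
        return $this->formatMoney($amount);
    }

    private function formatMoney(float $amount): string
    {
        return number_format($amount, 2, '.', ',') . ' EUR';
    }
}
```

With the duplication collapsed into `formatMoney()`, a future change to the currency format touches one line instead of three.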
Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes. You can also inspect the cohesion graph to spot unconnected or weakly connected components.
Once you have determined which fields belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Crawler and, based on those observations, apply Extract Interface as well. A sketch of such an Extract Class refactoring follows after the listing below.
The flagged `Crawler` class, with method bodies collapsed by the report viewer:

```php
<?php

class Crawler
{
    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var array[\Spatie\Crawler\CrawlObserver] */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var Browsershot */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt */
    protected $robotsTxt = null;

    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = []) { /* ... */ }

    public function __construct(Client $client, int $concurrency = 10) { /* ... */ }

    /**
     * @param int $concurrency
     *
     * @return $this
     */
    public function setConcurrency(int $concurrency) { /* ... */ }

    /**
     * Responses that are larger than the specified value will be ignored.
     *
     * @param int $maximumResponseSizeInBytes
     *
     * @return $this
     */
    public function setMaximumResponseSize(int $maximumResponseSizeInBytes) { /* ... */ }

    /**
     * @param int $maximumCrawlCount
     *
     * @return $this
     */
    public function setMaximumCrawlCount(int $maximumCrawlCount) { /* ... */ }

    /**
     * @param int $maximumDepth
     *
     * @return $this
     */
    public function setMaximumDepth(int $maximumDepth) { /* ... */ }

    /**
     * @return $this
     */
    public function ignoreRobots() { /* ... */ }

    /**
     * @return $this
     */
    public function respectRobots() { /* ... */ }

    /**
     * @param CrawlQueue $crawlQueue
     *
     * @return $this
     */
    public function setCrawlQueue(CrawlQueue $crawlQueue) { /* ... */ }

    /**
     * @return $this
     */
    public function executeJavaScript() { /* ... */ }

    /**
     * @return $this
     */
    public function doNotExecuteJavaScript() { /* ... */ }

    /**
     * @param \Spatie\Crawler\CrawlObserver|array[\Spatie\Crawler\CrawlObserver] $crawlObservers
     *
     * @return $this
     */
    public function setCrawlObserver($crawlObservers) { /* ... */ }

    public function setCrawlObservers(array $crawlObservers) { /* ... */ }

    public function addCrawlObserver(CrawlObserver $crawlObserver) { /* ... */ }

    /**
     * @param \Spatie\Crawler\CrawlProfile $crawlProfile
     *
     * @return $this
     */
    public function setCrawlProfile(CrawlProfile $crawlProfile) { /* ... */ }

    /**
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl) { /* ... */ }

    protected function startCrawlingQueue() { /* ... */ }

    public function endsWith($haystack, $needle) { /* ... */ }

    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string { /* ... */ }

    protected function createRobotsTxt(UriInterface $uri): RobotsTxt { /* ... */ }

    /**
     * @param ResponseInterface|null $response
     * @param CrawlUrl $crawlUrl
     */
    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl) { /* ... */ }

    /**
     * @param RequestException $exception
     * @param CrawlUrl $crawlUrl
     */
    protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl) { /* ... */ }

    protected function getCrawlRequests(): Generator { /* ... */ }

    protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl) { /* ... */ }

    protected function shouldCrawl(Node $node): bool { /* ... */ }

    /**
     * @param string $html
     * @param \Psr\Http\Message\UriInterface $foundOnUrl
     *
     * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null
     */
    protected function extractAllLinks(string $html, UriInterface $foundOnUrl) { /* ... */ }

    protected function normalizeUrl(UriInterface $url): UriInterface { /* ... */ }

    protected function hasCrawlableScheme(UriInterface $uri): bool { /* ... */ }

    protected function addtoDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl) { /* ... */ }

    protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string { /* ... */ }

    protected function getBrowsershot(): Browsershot { /* ... */ }

    public function setBrowsershot(Browsershot $browsershot) { /* ... */ }

    protected function addToCrawlQueue(CrawlUrl $crawlUrl) { /* ... */ }

    protected function maximumCrawlCountReached(): bool { /* ... */ }

    // Flagged by the duplication check ("View Code Duplication"):
    protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool { /* ... */ }

    // Flagged by the duplication check ("View Code Duplication"):
    protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool { /* ... */ }
}
```
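The two methods flagged as duplicates, `mayIndex()` and `mayFollow()`, share the same shape: consult the robots headers, then the robots meta tag. Their bodies are collapsed above, so the following is only a sketch of one way to fold the duplication into a single parameterized helper; the class name `RobotsChecker` and the helper `passesBothSources()` are invented, and the `mayIndex()`/`mayFollow()` calls on `RobotsHeaders`/`RobotsMeta` are an assumption inferred from the method names in the signatures:

```php
<?php

use Spatie\Robots\RobotsHeaders;
use Spatie\Robots\RobotsMeta;

class RobotsChecker
{
    public function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        return $this->passesBothSources($robotsHeaders, $robotsMeta, 'mayIndex');
    }

    public function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool
    {
        return $this->passesBothSources($robotsHeaders, $robotsMeta, 'mayFollow');
    }

    protected function passesBothSources(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta, string $check): bool
    {
        // A dynamic method call keeps a single body for both checks.
        // Assumption: spatie/robots exposes mayIndex()/mayFollow() on both
        // value objects, mirroring the names in Crawler's signatures.
        return $robotsHeaders->{$check}() && $robotsMeta->{$check}();
    }
}
```

Whether the shared body lives in a helper method on `Crawler` itself or in a small collaborator like this is a judgment call; either way the duplicated conditional logic collapses to one place.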
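The prefix/suffix heuristic from the advice above is also visible in the listing: `$executeJavaScript`, `$browsershot`, `executeJavaScript()`, `doNotExecuteJavaScript()`, `getBodyAfterExecutingJavaScript()`, `getBrowsershot()` and `setBrowsershot()` all cluster around one responsibility, rendering pages with JavaScript. A sketch of what Extract Class could look like, using a hypothetical `JavaScriptExecutor`; the real bodies are collapsed above, so the lazy default instance and the Browsershot call chain are assumptions:

```php
<?php

use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;

// Hypothetical extracted class: the Browsershot-related fields and methods
// move out of Crawler, which then delegates to this collaborator.
class JavaScriptExecutor
{
    /** @var Browsershot|null */
    protected $browsershot = null;

    public function setBrowsershot(Browsershot $browsershot): self
    {
        $this->browsershot = $browsershot;

        return $this;
    }

    public function getBrowsershot(): Browsershot
    {
        // Assumption: create a default instance lazily, as the nullable
        // $browsershot field in Crawler suggests.
        if (! $this->browsershot) {
            $this->browsershot = new Browsershot();
        }

        return $this->browsershot;
    }

    public function getBodyAfterExecutingJavaScript(UriInterface $url): string
    {
        // Assumption about the collapsed body: fetch the rendered HTML via
        // Browsershot's setUrl()/bodyHtml() chain.
        return $this->getBrowsershot()->setUrl((string) $url)->bodyHtml();
    }
}
```

`Crawler` would keep `executeJavaScript()`/`doNotExecuteJavaScript()` as thin toggles that swap this collaborator in and out; if other classes only ever need `getBodyAfterExecutingJavaScript()`, that method is also the natural seam for Extract Interface.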
This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.
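Concretely, the flagged annotations are the `array[\Spatie\Crawler\CrawlObserver]` forms on `$crawlObservers` and `setCrawlObserver()`. Standard PHPDoc expresses "array of X" as `X[]`, which PHPDoc parsers generally accept; a sketch of the corrected annotations:

```php
/** @var \Spatie\Crawler\CrawlObserver[] */
protected $crawlObservers;

/**
 * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers
 *
 * @return $this
 */
public function setCrawlObserver($crawlObservers) { /* ... */ }
```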