Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected, or weakly connected, components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Crawler, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
27 | class Crawler |
||
28 | { |
||
29 | /** @var \GuzzleHttp\Client */ |
||
30 | protected $client; |
||
31 | |||
32 | /** @var \Psr\Http\Message\UriInterface */ |
||
33 | protected $baseUrl; |
||
34 | |||
35 | /** @var array[\Spatie\Crawler\CrawlObserver] */ |
||
36 | protected $crawlObservers; |
||
37 | |||
38 | /** @var \Spatie\Crawler\CrawlProfile */ |
||
39 | protected $crawlProfile; |
||
40 | |||
41 | /** @var int */ |
||
42 | protected $concurrency; |
||
43 | |||
44 | /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */ |
||
45 | protected $crawlQueue; |
||
46 | |||
47 | /** @var int */ |
||
48 | protected $crawledUrlCount = 0; |
||
49 | |||
50 | /** @var int|null */ |
||
51 | protected $maximumCrawlCount = null; |
||
52 | |||
53 | /** @var int */ |
||
54 | protected $maximumResponseSize = 1024 * 1024 * 2; |
||
55 | |||
56 | /** @var int|null */ |
||
57 | protected $maximumDepth = null; |
||
58 | |||
59 | /** @var bool */ |
||
60 | protected $ignoreRobots = false; |
||
61 | |||
62 | /** @var \Tree\Node\Node */ |
||
63 | protected $depthTree; |
||
64 | |||
65 | /** @var bool */ |
||
66 | protected $executeJavaScript = false; |
||
67 | |||
68 | /** @var Browsershot */ |
||
69 | protected $browsershot = null; |
||
70 | |||
71 | /** @var \Spatie\Robots\RobotsTxt */ |
||
72 | private $robotsTxt = null; |
||
73 | |||
74 | protected static $defaultClientOptions = [ |
||
75 | RequestOptions::COOKIES => true, |
||
76 | RequestOptions::CONNECT_TIMEOUT => 10, |
||
77 | RequestOptions::TIMEOUT => 10, |
||
78 | RequestOptions::ALLOW_REDIRECTS => false, |
||
79 | ]; |
||
80 | |||
81 | /** |
||
82 | * @param array $clientOptions |
||
83 | * |
||
84 | * @return static |
||
85 | */ |
||
86 | public static function create(array $clientOptions = []) |
||
96 | |||
97 | public function __construct(Client $client, int $concurrency = 10) |
||
98 | { |
||
99 | $this->client = $client; |
||
100 | |||
101 | $this->concurrency = $concurrency; |
||
102 | |||
103 | $this->crawlProfile = new CrawlAllUrls(); |
||
104 | |||
105 | $this->crawlQueue = new CollectionCrawlQueue(); |
||
106 | } |
||
107 | |||
108 | /** |
||
109 | * @param int $concurrency |
||
110 | * |
||
111 | * @return $this |
||
112 | */ |
||
113 | public function setConcurrency(int $concurrency) |
||
119 | |||
120 | /** |
||
121 | * Responses that are larger than the specified value will be ignored. |
||
122 | * |
||
123 | * @param int $maximumResponseSizeInBytes |
||
124 | * |
||
125 | * @return $this |
||
126 | */ |
||
127 | public function setMaximumResponseSize(int $maximumResponseSizeInBytes) |
||
133 | |||
134 | /** |
||
135 | * @param int $maximumCrawlCount |
||
136 | * |
||
137 | * @return $this |
||
138 | */ |
||
139 | public function setMaximumCrawlCount(int $maximumCrawlCount) |
||
145 | |||
146 | /** |
||
147 | * @param int $maximumDepth |
||
148 | * |
||
149 | * @return $this |
||
150 | */ |
||
151 | public function setMaximumDepth(int $maximumDepth) |
||
157 | |||
158 | /** |
||
159 | * @param bool $ignoreRobots |
||
160 | * |
||
161 | * @return $this |
||
162 | */ |
||
163 | public function ignoreRobots(bool $ignoreRobots = true) |
||
169 | |||
170 | /** |
||
171 | * @param CrawlQueue $crawlQueue |
||
172 | * |
||
173 | * @return $this |
||
174 | */ |
||
175 | public function setCrawlQueue(CrawlQueue $crawlQueue) |
||
181 | |||
182 | /** |
||
183 | * @return $this |
||
184 | */ |
||
185 | public function executeJavaScript() |
||
191 | |||
192 | /** |
||
193 | * @return $this |
||
194 | */ |
||
195 | public function doNotExecuteJavaScript() |
||
201 | |||
202 | /** |
||
203 | * @param \Spatie\Crawler\CrawlObserver|array[\Spatie\Crawler\CrawlObserver] $crawlObservers |
||
204 | * |
||
205 | * @return $this |
||
206 | */ |
||
207 | public function setCrawlObserver($crawlObservers) |
||
215 | |||
216 | public function setCrawlObservers(array $crawlObservers) |
||
222 | |||
223 | public function addCrawlObserver(CrawlObserver $crawlObserver) |
||
229 | |||
230 | /** |
||
231 | * @param \Spatie\Crawler\CrawlProfile $crawlProfile |
||
232 | * |
||
233 | * @return $this |
||
234 | */ |
||
235 | public function setCrawlProfile(CrawlProfile $crawlProfile) |
||
241 | |||
242 | /** |
||
243 | * @param \Psr\Http\Message\UriInterface|string $baseUrl |
||
244 | */ |
||
245 | public function startCrawling($baseUrl) |
||
246 | { |
||
247 | if (! $baseUrl instanceof UriInterface) { |
||
248 | $baseUrl = new Uri($baseUrl); |
||
249 | } |
||
250 | |||
251 | if ($baseUrl->getScheme() === '') { |
||
252 | $baseUrl = $baseUrl->withScheme('http'); |
||
253 | } |
||
254 | |||
255 | if ($baseUrl->getPath() === '') { |
||
256 | $baseUrl = $baseUrl->withPath('/'); |
||
257 | } |
||
258 | |||
259 | $this->baseUrl = $baseUrl; |
||
260 | |||
261 | $crawlUrl = CrawlUrl::create($this->baseUrl); |
||
262 | |||
263 | $this->robotsTxt = $this->createRobotsTxt($crawlUrl->url); |
||
264 | |||
265 | $this->addToCrawlQueue($crawlUrl); |
||
266 | |||
267 | $this->depthTree = new Node((string) $this->baseUrl); |
||
268 | |||
269 | $this->startCrawlingQueue(); |
||
270 | |||
271 | foreach ($this->crawlObservers as $crawlObserver) { |
||
272 | $crawlObserver->finishedCrawling(); |
||
273 | } |
||
274 | } |
||
275 | |||
276 | protected function startCrawlingQueue() |
||
325 | |||
326 | public function endsWith($haystack, $needle) |
||
331 | |||
332 | protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string |
||
340 | |||
341 | protected function createRobotsTxt(UriInterface $uri): RobotsTxt |
||
345 | |||
346 | /** |
||
347 | * @param ResponseInterface|null $response |
||
348 | * @param CrawlUrl $crawlUrl |
||
349 | */ |
||
350 | protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl) |
||
360 | |||
361 | /** |
||
362 | * @param RequestException $exception |
||
363 | * @param CrawlUrl $crawlUrl |
||
364 | */ |
||
365 | protected function handleCrawlFailed(RequestException $exception, CrawlUrl $crawlUrl) |
||
375 | |||
376 | protected function getCrawlRequests(): Generator |
||
397 | |||
398 | protected function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl) |
||
435 | |||
436 | protected function shouldCrawl(Node $node): bool |
||
448 | |||
449 | /** |
||
450 | * @param string $html |
||
451 | * @param \Psr\Http\Message\UriInterface $foundOnUrl |
||
452 | * |
||
453 | * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null |
||
454 | */ |
||
455 | protected function extractAllLinks(string $html, UriInterface $foundOnUrl) |
||
476 | |||
477 | protected function normalizeUrl(UriInterface $url): UriInterface |
||
481 | |||
482 | protected function hasCrawlableScheme(UriInterface $uri): bool |
||
486 | |||
487 | protected function addtoDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl) |
||
509 | |||
510 | protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string |
||
518 | |||
519 | protected function getBrowsershot(): Browsershot |
||
529 | |||
530 | public function setBrowsershot(Browsershot $browsershot) |
||
536 | |||
537 | protected function addToCrawlQueue(CrawlUrl $crawlUrl) |
||
545 | |||
546 | protected function maximumCrawlCountReached(): bool |
||
554 | |||
555 | protected function mayIndex(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool |
||
563 | |||
564 | protected function mayFollow(RobotsHeaders $robotsHeaders, RobotsMeta $robotsMeta): bool |
||
572 | } |
||
573 |
This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.