Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Crawler, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
20 | class Crawler |
||
21 | { |
||
22 | /** @var \GuzzleHttp\Client */ |
||
23 | protected $client; |
||
24 | |||
25 | /** @var \Psr\Http\Message\UriInterface */ |
||
26 | protected $baseUrl; |
||
27 | |||
28 | /** @var \Spatie\Crawler\CrawlObserverCollection */ |
||
29 | protected $crawlObservers; |
||
30 | |||
31 | /** @var \Spatie\Crawler\CrawlProfile */ |
||
32 | protected $crawlProfile; |
||
33 | |||
34 | /** @var int */ |
||
35 | protected $concurrency; |
||
36 | |||
37 | /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */ |
||
38 | protected $crawlQueue; |
||
39 | |||
40 | /** @var int */ |
||
41 | protected $crawledUrlCount = 0; |
||
42 | |||
43 | /** @var int|null */ |
||
44 | protected $maximumCrawlCount = null; |
||
45 | |||
46 | /** @var int */ |
||
47 | protected $maximumResponseSize = 1024 * 1024 * 2; |
||
48 | |||
49 | /** @var int|null */ |
||
50 | protected $maximumDepth = null; |
||
51 | |||
52 | /** @var bool */ |
||
53 | protected $respectRobots = true; |
||
54 | |||
55 | /** @var \Tree\Node\Node */ |
||
56 | protected $depthTree; |
||
57 | |||
58 | /** @var bool */ |
||
59 | protected $executeJavaScript = false; |
||
60 | |||
61 | /** @var Browsershot */ |
||
62 | protected $browsershot = null; |
||
63 | |||
64 | /** @var \Spatie\Robots\RobotsTxt */ |
||
65 | protected $robotsTxt = null; |
||
66 | |||
67 | protected static $defaultClientOptions = [ |
||
68 | RequestOptions::COOKIES => true, |
||
69 | RequestOptions::CONNECT_TIMEOUT => 10, |
||
70 | RequestOptions::TIMEOUT => 10, |
||
71 | RequestOptions::ALLOW_REDIRECTS => false, |
||
72 | ]; |
||
73 | |||
74 | /** |
||
75 | * @param array $clientOptions |
||
76 | * |
||
77 | * @return static |
||
78 | */ |
||
79 | public static function create(array $clientOptions = []) |
||
80 | { |
||
81 | $clientOptions = (count($clientOptions)) |
||
82 | ? $clientOptions |
||
83 | : static::$defaultClientOptions; |
||
84 | |||
85 | $client = new Client($clientOptions); |
||
86 | |||
87 | return new static($client); |
||
88 | } |
||
89 | |||
90 | public function __construct(Client $client, int $concurrency = 10) |
||
91 | { |
||
92 | $this->client = $client; |
||
93 | |||
94 | $this->concurrency = $concurrency; |
||
95 | |||
96 | $this->crawlProfile = new CrawlAllUrls(); |
||
97 | |||
98 | $this->crawlQueue = new CollectionCrawlQueue(); |
||
99 | |||
100 | $this->crawlObservers = new CrawlObserverCollection(); |
||
101 | } |
||
102 | |||
103 | public function setConcurrency(int $concurrency): self |
||
109 | |||
110 | public function setMaximumResponseSize(int $maximumResponseSizeInBytes): self |
||
111 | { |
||
112 | $this->maximumResponseSize = $maximumResponseSizeInBytes; |
||
113 | |||
114 | return $this; |
||
115 | } |
||
116 | |||
117 | public function getMaximumResponseSize(): ?int |
||
118 | { |
||
119 | return $this->maximumResponseSize; |
||
120 | } |
||
121 | |||
122 | public function setMaximumCrawlCount(int $maximumCrawlCount): self |
||
123 | { |
||
124 | $this->maximumCrawlCount = $maximumCrawlCount; |
||
125 | |||
126 | return $this; |
||
127 | } |
||
128 | |||
129 | public function getMaximumCrawlCount(): ?int |
||
130 | { |
||
131 | return $this->maximumCrawlCount; |
||
132 | } |
||
133 | |||
134 | public function getCrawlerUrlCount(): int |
||
135 | { |
||
136 | return $this->crawledUrlCount; |
||
137 | } |
||
138 | |||
139 | public function setMaximumDepth(int $maximumDepth): self |
||
140 | { |
||
141 | $this->maximumDepth = $maximumDepth; |
||
142 | |||
143 | return $this; |
||
144 | } |
||
145 | |||
146 | public function getMaximumDepth(): ?int |
||
147 | { |
||
148 | return $this->maximumDepth; |
||
149 | } |
||
150 | |||
151 | public function ignoreRobots(): self |
||
152 | { |
||
153 | $this->respectRobots = false; |
||
154 | |||
155 | return $this; |
||
156 | } |
||
157 | |||
158 | public function respectRobots(): self |
||
159 | { |
||
160 | $this->respectRobots = true; |
||
161 | |||
162 | return $this; |
||
163 | } |
||
164 | |||
165 | public function mustRespectRobots(): bool |
||
166 | { |
||
167 | return $this->respectRobots; |
||
168 | } |
||
169 | |||
170 | public function getRobotsTxt(): RobotsTxt |
||
171 | { |
||
172 | return $this->robotsTxt; |
||
173 | } |
||
174 | |||
175 | public function setCrawlQueue(CrawlQueue $crawlQueue): self |
||
181 | |||
182 | public function getCrawlQueue(): CrawlQueue |
||
183 | { |
||
184 | return $this->crawlQueue; |
||
185 | } |
||
186 | |||
187 | public function executeJavaScript(): self |
||
188 | { |
||
189 | $this->executeJavaScript = true; |
||
190 | |||
191 | return $this; |
||
192 | } |
||
193 | |||
194 | public function doNotExecuteJavaScript(): self |
||
200 | |||
201 | public function mayExecuteJavascript(): bool |
||
202 | { |
||
203 | return $this->executeJavaScript; |
||
204 | } |
||
205 | |||
206 | /** |
||
207 | * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers |
||
|
|||
208 | * |
||
209 | * @return $this |
||
210 | */ |
||
211 | public function setCrawlObserver($crawlObservers): self |
||
212 | { |
||
213 | if (! is_array($crawlObservers)) { |
||
214 | $crawlObservers = [$crawlObservers]; |
||
215 | } |
||
219 | |||
220 | public function setCrawlObservers(array $crawlObservers): self |
||
226 | |||
227 | public function addCrawlObserver(CrawlObserver $crawlObserver): self |
||
233 | |||
234 | public function getCrawlObservers(): CrawlObserverCollection |
||
238 | |||
239 | public function setCrawlProfile(CrawlProfile $crawlProfile): self |
||
245 | |||
246 | public function getCrawlProfile(): CrawlProfile |
||
250 | |||
251 | public function setBrowsershot(Browsershot $browsershot) |
||
257 | |||
258 | public function getBrowsershot(): Browsershot |
||
266 | |||
267 | public function getBaseUrl(): UriInterface |
||
271 | |||
272 | /** |
||
273 | * @param \Psr\Http\Message\UriInterface|string $baseUrl |
||
274 | */ |
||
275 | public function startCrawling($baseUrl) |
||
309 | |||
310 | public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node |
||
334 | |||
335 | protected function startCrawlingQueue() |
||
350 | |||
351 | /** |
||
352 | * @deprecated This function will be removed in the next major version |
||
353 | */ |
||
354 | public function endsWith($haystack, $needle) |
||
359 | |||
360 | protected function createRobotsTxt(UriInterface $uri): RobotsTxt |
||
364 | |||
365 | protected function getCrawlRequests(): Generator |
||
386 | |||
387 | public function addToCrawlQueue(CrawlUrl $crawlUrl): self |
||
403 | |||
404 | public function maximumCrawlCountReached(): bool |
||
414 | } |
||
415 |
This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.