Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Crawler and, based on these observations, to apply Extract Interface, too.
1 | <?php |
||
18 | class Crawler implements LoggerAwareInterface |
||
19 | { |
||
20 | /** |
||
21 | * @var CrawlerClientInterface Client assigned through setClient(); the constructor falls back to a GoutteClient when none is given. |
||
22 | */ |
||
23 | private $client; |
||
24 | |||
25 | /** |
||
26 | * @var int |
||
27 | */ |
||
28 | private $limit = 0; |
||
29 | |||
30 | /** |
||
31 | * @var bool |
||
32 | */ |
||
33 | private $stopOnError = false; |
||
34 | |||
35 | /** |
||
36 | * @var bool |
||
37 | */ |
||
38 | private $exceptionOnError = false; |
||
39 | |||
40 | /** |
||
41 | * @var UrlMatcherInterface[] |
||
42 | */ |
||
43 | private $whitelistUrlMatchers = []; |
||
44 | |||
45 | /** |
||
46 | * @var UrlMatcherInterface[] |
||
47 | */ |
||
48 | private $blacklistUrlMatchers = []; |
||
49 | |||
50 | /** |
||
51 | * @var UrlNormalizerInterface[] |
||
52 | */ |
||
53 | private $urlNormalizers = []; |
||
54 | |||
55 | /** |
||
56 | * @var Url |
||
57 | */ |
||
58 | private $baseUrl; |
||
59 | |||
60 | /** |
||
61 | * @var UrlCollection |
||
62 | */ |
||
63 | private $urlsCrawled; |
||
64 | |||
65 | /** |
||
66 | * @var UrlCollection |
||
67 | */ |
||
68 | private $urlsQueued; |
||
69 | |||
70 | /** |
||
71 | * @var UrlCollection |
||
72 | */ |
||
73 | private $urlsReturned; |
||
74 | |||
75 | /** |
||
76 | * @var string[] Rejected URL strings as stored by addRejectedUrl(): each entry is keyed by the URL string itself. |
||
77 | */ |
||
78 | private $urlsRejected = []; |
||
79 | |||
80 | /** |
||
81 | * @var LoggerInterface |
||
82 | */ |
||
83 | private $logger = null; |
||
84 | |||
/**
 * Creates a crawler bound to the given client.
 *
 * When no client is supplied, a GoutteClient is instantiated as the default.
 * The crawled, queued and returned URL collections each start out as a new,
 * separate UrlCollection instance.
 *
 * @param CrawlerClientInterface|null $client Client to use, or null for the default GoutteClient.
 */
public function __construct(CrawlerClientInterface $client = null)
{
    // The type hint only admits null or a client instance, so a strict null
    // check expresses the intent more precisely than empty().
    if (null === $client) {
        $client = new GoutteClient();
    }

    $this->setClient($client);

    $this->urlsCrawled = new UrlCollection();
    $this->urlsQueued = new UrlCollection();
    $this->urlsReturned = new UrlCollection();

    // Deliberately no return statement: `new` always evaluates to the
    // instance, so a value returned from a constructor is discarded.
}
||
102 | |||
/**
 * Replaces the client stored on this crawler.
 *
 * @param CrawlerClientInterface $client Client to store.
 * @return $this
 */
public function setClient(CrawlerClientInterface $client)
{
    $this->client = $client;

    // Fluent return, consistent with setLimit(), setStopOnError() and
    // setExceptionOnError(); callers that ignore the result are unaffected.
    return $this;
}
|
110 | |||
/**
 * Returns the client currently stored on this crawler.
 *
 * @return CrawlerClientInterface
 */
public function getClient()
{
    return $this->client;
}
||
118 | |||
/**
 * Returns the limit value stored on this crawler (0 unless changed via setLimit()).
 *
 * @return int
 */
public function getLimit()
{
    return $this->limit;
}
||
126 | |||
/**
 * Stores a new limit value on this crawler.
 *
 * The value is stored as given; no validation or casting is applied.
 *
 * @param int $limit Limit value to store.
 * @return $this
 */
public function setLimit($limit)
{
    $this->limit = $limit;

    return $this;
}
||
137 | |||
/**
 * Returns the stop-on-error flag (false unless changed via setStopOnError()).
 *
 * @return bool
 */
public function getStopOnError()
{
    return $this->stopOnError;
}
||
145 | |||
/**
 * Stores the stop-on-error flag.
 *
 * The value is stored as given; it is not cast to bool.
 *
 * @param bool $stopOnError Flag value to store.
 * @return $this
 */
public function setStopOnError($stopOnError)
{
    $this->stopOnError = $stopOnError;

    return $this;
}
||
156 | |||
/**
 * Returns the exception-on-error flag (false unless changed via setExceptionOnError()).
 *
 * @return bool
 */
public function getExceptionOnError()
{
    return $this->exceptionOnError;
}
||
164 | |||
/**
 * Stores the exception-on-error flag.
 *
 * The value is stored as given; it is not cast to bool.
 *
 * @param bool $exceptionOnError Flag value to store.
 * @return $this
 */
public function setExceptionOnError($exceptionOnError)
{
    $this->exceptionOnError = $exceptionOnError;

    return $this;
}
||
175 | |||
/**
 * Returns the crawled-URL collection converted to an array via toArray().
 *
 * @return array
 */
public function getUrlsCrawled()
{
    return $this->urlsCrawled->toArray();
}
||
183 | |||
/**
 * Returns the queued-URL collection converted to an array via toArray().
 *
 * @return array
 */
public function getUrlsQueued()
{
    return $this->urlsQueued->toArray();
}
||
191 | |||
192 | /** |
||
193 | * @return array |
||
194 | */ |
||
195 | 4 | public function getUrlsReturned() |
|
199 | |||
/**
 * Returns the rejected-URL array exactly as stored on this crawler.
 *
 * @return array
 */
public function getUrlsRejected()
{
    return $this->urlsRejected;
}
||
207 | |||
208 | /** |
||
209 | * @param $urlMatchers |
||
210 | * @return $this |
||
211 | */ |
||
212 | 1 | public function setWhitelistUrlMatchers(array $urlMatchers) |
|
221 | |||
222 | /** |
||
223 | * @return Url\Matcher\UrlMatcherInterface[] |
||
224 | */ |
||
225 | 2 | public function getWhitelistUrlMatchers() |
|
229 | |||
230 | /** |
||
231 | * @param UrlMatcherInterface $urlMatcher |
||
232 | * @return $this |
||
233 | */ |
||
234 | 2 | public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher) |
|
240 | |||
241 | /** |
||
242 | * @return $this |
||
243 | */ |
||
244 | 1 | public function clearWhitelistUrlMatchers() |
|
250 | |||
251 | /** |
||
252 | * @param array $urlMatchers |
||
253 | * @return $this |
||
254 | */ |
||
255 | 1 | public function setBlacklistUrlMatchers(array $urlMatchers) |
|
264 | |||
265 | /** |
||
266 | * @return UrlMatcherInterface[] |
||
267 | */ |
||
268 | 2 | public function getBlacklistUrlMatchers() |
|
272 | |||
273 | /** |
||
274 | * @param UrlMatcherInterface $urlMatcher |
||
275 | * @return $this |
||
276 | */ |
||
277 | 2 | public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher) |
|
283 | |||
284 | /** |
||
285 | * @return $this |
||
286 | */ |
||
287 | 1 | public function clearBlacklistUrlMatchers() |
|
293 | |||
294 | /** |
||
295 | * @param array $normalizers |
||
296 | * @return $this |
||
297 | */ |
||
298 | 1 | public function setUrlNormalizers(array $normalizers) |
|
308 | |||
309 | /** |
||
310 | * @return UrlNormalizerInterface[] |
||
311 | */ |
||
312 | 1 | public function getUrlNormalizers() |
|
316 | |||
317 | /** |
||
318 | * @param UrlNormalizerInterface $normalizer |
||
319 | * @return $this |
||
320 | */ |
||
321 | 2 | public function addUrlNormalizer(UrlNormalizerInterface $normalizer) |
|
327 | |||
328 | /** |
||
329 | * @return $this |
||
330 | */ |
||
331 | 1 | public function clearUrlNormalizers() |
|
337 | |||
338 | /** |
||
339 | * @return LoggerInterface |
||
340 | */ |
||
341 | 13 | public function getLogger() |
|
349 | |||
350 | /** |
||
351 | * @param LoggerInterface $logger |
||
352 | * @return $this |
||
353 | */ |
||
354 | 1 | public function setLogger(LoggerInterface $logger) |
|
360 | |||
361 | /** |
||
362 | * @param $url |
||
363 | * @return Url |
||
364 | * @throws \Exception |
||
365 | */ |
||
366 | 10 | protected function createHttpUrlString($url) |
|
379 | |||
380 | /** |
||
381 | * @param Url $url |
||
382 | */ |
||
383 | 11 | protected function reset(Url $url) |
|
394 | |||
395 | /** |
||
396 | * @param string $url |
||
397 | * @return \Generator|Page[] |
||
398 | * @throws RequestException |
||
399 | */ |
||
400 | 10 | public function crawl($url) |
|
439 | |||
440 | /** |
||
441 | * @param Url $url |
||
442 | * @return Url |
||
443 | */ |
||
444 | 10 | protected function updateResolvedUrl(Url $url) |
|
453 | |||
454 | /** |
||
455 | * @param DomCrawler $crawler |
||
456 | */ |
||
457 | 9 | protected function updateQueue(DomCrawler $crawler) |
|
472 | |||
473 | /** |
||
474 | * @param Url $url |
||
475 | * @return Url |
||
476 | */ |
||
477 | 10 | protected function normalizeUrl(Url $url) |
|
485 | |||
486 | /** |
||
487 | * @param Url $url |
||
488 | * @return bool |
||
489 | */ |
||
490 | 9 | protected function shouldReturnUrl(Url $url) |
|
508 | |||
509 | /** |
||
510 | * @param Url $url |
||
511 | * @return bool |
||
512 | */ |
||
513 | 1 | protected function isUrlWhitelisted(Url $url) |
|
523 | |||
524 | /** |
||
525 | * @param Url $url |
||
526 | * @return bool |
||
527 | */ |
||
528 | 9 | protected function isUrlBlacklisted(Url $url) |
|
538 | |||
539 | /** |
||
540 | * @param Url $url |
||
541 | * @return bool |
||
542 | */ |
||
543 | 8 | protected function shouldCrawlUrl(Url $url) |
|
557 | |||
/**
 * Records a URL in the rejected list.
 *
 * A Url instance is first converted to its string form; the resulting string
 * is used as both key and value, so recording the same URL twice leaves a
 * single entry.
 *
 * @param string|Url $url URL to record.
 * @throws \InvalidArgumentException When $url is neither a string nor a Url instance.
 */
protected function addRejectedUrl($url)
{
    // Only Url objects are converted; any other value is validated as-is.
    $rejected = ($url instanceof Url) ? $url->__toString() : $url;

    if (!is_string($rejected)) {
        throw new \InvalidArgumentException('Url should be a string or an instance of '.Url::class);
    }

    $this->urlsRejected[$rejected] = $rejected;
}
|
572 | |||
573 | /** |
||
574 | * @param Url $url |
||
575 | * @return bool |
||
576 | */ |
||
577 | 8 | protected function isUrlPartOfBaseUrl(Url $url) |
|
587 | |||
588 | /** |
||
589 | * @return bool |
||
590 | */ |
||
591 | 9 | protected function isLimitReached() |
|
595 | |||
596 | /** |
||
597 | * @param DomCrawler $crawler |
||
598 | * @return array |
||
599 | */ |
||
600 | 9 | protected function extractUrlsFromCrawler(DomCrawler $crawler) |
|
608 | |||
609 | /** |
||
610 | * @param Url $url |
||
611 | * @return DomCrawler |
||
612 | */ |
||
613 | 10 | protected function requestPage(Url $url) |
|
621 | } |
||
622 |