Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Crawler and, based on these observations, to apply Extract Interface as well.
1 | <?php |
||
14 | class Crawler implements LoggerAwareInterface |
||
15 | { |
||
16 | /** |
||
17 | * @var Client |
||
18 | */ |
||
19 | private $client; |
||
20 | |||
21 | /** |
||
22 | * @var int |
||
23 | */ |
||
24 | private $limit = 0; |
||
25 | |||
26 | /** |
||
27 | * @var bool |
||
28 | */ |
||
29 | private $stopOnError = false; |
||
30 | |||
31 | /** |
||
32 | * @var bool |
||
33 | */ |
||
34 | private $exceptionOnError = false; |
||
35 | |||
36 | /** |
||
37 | * @var UrlMatcherInterface[] |
||
38 | */ |
||
39 | private $whitelistUrlMatchers = []; |
||
40 | |||
41 | /** |
||
42 | * @var UrlMatcherInterface[] |
||
43 | */ |
||
44 | private $blacklistUrlMatchers = []; |
||
45 | |||
46 | /** |
||
47 | * @var UrlNormalizerInterface[] |
||
48 | */ |
||
49 | private $urlNormalizers = []; |
||
50 | |||
51 | /** |
||
52 | * @var Url |
||
53 | */ |
||
54 | private $baseUrl; |
||
55 | |||
56 | /** |
||
57 | * @var array |
||
58 | */ |
||
59 | private $urlsCrawled = []; |
||
60 | |||
61 | /** |
||
62 | * @var array |
||
63 | */ |
||
64 | private $urlsQueued = []; |
||
65 | |||
66 | /** |
||
67 | * @var array |
||
68 | */ |
||
69 | private $urlsRejected = []; |
||
70 | |||
71 | /** |
||
72 | * @var array |
||
73 | */ |
||
74 | private $urlsReturned = []; |
||
75 | |||
76 | /** |
||
77 | * @var LoggerInterface |
||
78 | */ |
||
79 | private $logger = null; |
||
80 | |||
81 | /** |
||
82 | * @param Client $client |
||
83 | * @param array $options |
||
84 | */ |
||
85 | 14 | public function __construct(Client $client = null, array $options = []) |
|
96 | |||
97 | /** |
||
98 | * @param Client $client |
||
99 | */ |
||
100 | 14 | public function setClient(Client $client) |
|
104 | |||
105 | /** |
||
106 | * @return Client |
||
107 | */ |
||
108 | 2 | public function getClient() |
|
112 | |||
113 | /** |
||
114 | * @param array $options |
||
115 | */ |
||
116 | 14 | public function setOptions(array $options) |
|
140 | |||
141 | /** |
||
142 | * @return int |
||
143 | */ |
||
144 | 3 | public function getLimit() |
|
148 | |||
149 | /** |
||
150 | * @param int $limit |
||
151 | * @return $this |
||
152 | */ |
||
153 | 3 | public function setLimit($limit) |
|
159 | |||
160 | /** |
||
161 | * @return boolean |
||
162 | */ |
||
163 | 6 | public function getStopOnError() |
|
167 | |||
168 | /** |
||
169 | * @param boolean $stopOnError |
||
170 | * @return $this |
||
171 | */ |
||
172 | 3 | public function setStopOnError($stopOnError) |
|
178 | |||
179 | /** |
||
180 | * @return boolean |
||
181 | */ |
||
182 | 3 | public function getExceptionOnError() |
|
186 | |||
187 | /** |
||
188 | * @param boolean $exceptionOnError |
||
189 | * @return $this |
||
190 | */ |
||
191 | 2 | public function setExceptionOnError($exceptionOnError) |
|
197 | |||
198 | /** |
||
199 | * @return array |
||
200 | */ |
||
201 | 9 | public function getUrlsCrawled() |
|
205 | |||
206 | /** |
||
207 | * @return array |
||
208 | */ |
||
209 | 2 | public function getUrlsQueued() |
|
213 | |||
214 | /** |
||
215 | * @return array |
||
216 | */ |
||
217 | 2 | public function getUrlsRejected() |
|
221 | |||
222 | /** |
||
223 | * @return array |
||
224 | */ |
||
225 | 4 | public function getUrlsReturned() |
|
229 | |||
230 | /** |
||
231 | * @param array $urlMatchers |
||
232 | * @return $this |
||
233 | */ |
||
234 | 2 | public function setWhitelistUrlMatchers(array $urlMatchers) |
|
243 | |||
244 | /** |
||
245 | * @return Url\Matcher\UrlMatcherInterface[] |
||
246 | */ |
||
247 | 3 | public function getWhitelistUrlMatchers() |
|
251 | |||
252 | /** |
||
253 | * @param UrlMatcherInterface $urlMatcher |
||
254 | * @return $this |
||
255 | */ |
||
256 | 3 | public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher) |
|
262 | |||
263 | /** |
||
264 | * @return $this |
||
265 | */ |
||
266 | 2 | public function clearWhitelistUrlMatchers() |
|
272 | |||
273 | /** |
||
274 | * @param array $urlMatchers |
||
275 | * @return $this |
||
276 | */ |
||
277 | 2 | public function setBlacklistUrlMatchers(array $urlMatchers) |
|
286 | |||
287 | /** |
||
288 | * @return Url\Matcher\UrlMatcherInterface[] |
||
289 | */ |
||
290 | 3 | public function getBlacklistUrlMatchers() |
|
294 | |||
295 | /** |
||
296 | * @param UrlMatcherInterface $urlMatcher |
||
297 | * @return $this |
||
298 | */ |
||
299 | 3 | public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher) |
|
305 | |||
306 | /** |
||
307 | * @return $this |
||
308 | */ |
||
309 | 2 | public function clearBlacklistUrlMatchers() |
|
315 | |||
316 | /** |
||
317 | * @param array $normalizers |
||
318 | * @return $this |
||
319 | */ |
||
320 | 2 | public function setUrlNormalizers(array $normalizers) |
|
330 | |||
331 | /** |
||
332 | * @return UrlNormalizerInterface[] |
||
333 | */ |
||
334 | 2 | public function getUrlNormalizers() |
|
338 | |||
339 | /** |
||
340 | * @param UrlNormalizerInterface $normalizer |
||
341 | * @return $this |
||
342 | */ |
||
343 | 3 | public function addUrlNormalizer(UrlNormalizerInterface $normalizer) |
|
349 | |||
350 | /** |
||
351 | * @return $this |
||
352 | */ |
||
353 | 2 | public function clearUrlNormalizers() |
|
359 | |||
360 | /** |
||
361 | * @return LoggerInterface |
||
362 | */ |
||
363 | 12 | public function getLogger() |
|
371 | |||
372 | /** |
||
373 | * @param LoggerInterface $logger |
||
374 | * @return $this |
||
375 | */ |
||
376 | 2 | public function setLogger(LoggerInterface $logger) |
|
382 | |||
383 | /** |
||
384 | * @param Url $url |
||
385 | */ |
||
386 | 9 | protected function addUrlToQueue(Url $url) |
|
390 | |||
391 | /** |
||
392 | * @param string $url |
||
393 | * @return Url |
||
394 | */ |
||
395 | 9 | protected function createHttpUrlString($url) |
|
399 | |||
400 | /** |
||
401 | * @param Url $url |
||
402 | */ |
||
403 | 9 | protected function reset(Url $url) |
|
411 | |||
412 | /** |
||
413 | * @param string $url |
||
414 | * @return \Generator |
||
415 | * @throws RequestException |
||
416 | */ |
||
417 | 9 | public function crawl($url) |
|
458 | |||
459 | /** |
||
460 | * @param DomCrawler $crawler |
||
461 | */ |
||
462 | 9 | protected function updateQueue(DomCrawler $crawler) |
|
480 | |||
481 | /** |
||
482 | * @param Url $url |
||
483 | * @return Url |
||
484 | */ |
||
485 | 8 | protected function normalizeUrl(Url $url) |
|
493 | |||
494 | /** |
||
495 | * @param Url $url |
||
496 | * @return bool |
||
497 | */ |
||
498 | 9 | protected function shouldReturnUrl(Url $url) |
|
516 | |||
517 | /** |
||
518 | * @param Url $url |
||
519 | * @return bool |
||
520 | */ |
||
521 | 1 | protected function isUrlWhitelisted(Url $url) |
|
531 | |||
532 | /** |
||
533 | * @param Url $url |
||
534 | * @return bool |
||
535 | */ |
||
536 | 9 | protected function isUrlBlacklisted(Url $url) |
|
546 | |||
547 | /** |
||
548 | * @param Url $url |
||
549 | * @return bool |
||
550 | */ |
||
551 | 8 | protected function shouldCrawlUrl(Url $url) |
|
565 | |||
566 | /** |
||
567 | * @param Url $url |
||
568 | * @return bool |
||
569 | */ |
||
570 | 8 | protected function isUrlRejected(Url $url) |
|
574 | |||
575 | /** |
||
576 | * @param Url $url |
||
577 | * @return bool |
||
578 | */ |
||
579 | 8 | protected function isUrlCrawled(Url $url) |
|
583 | |||
584 | /** |
||
585 | * @param Url $url |
||
586 | * @return bool |
||
587 | */ |
||
588 | 8 | protected function isUrlQueued(Url $url) |
|
592 | |||
593 | /** |
||
594 | * @param Url $url |
||
595 | * @return bool |
||
596 | */ |
||
597 | 8 | protected function isUrlPartOfBaseUrl(Url $url) |
|
607 | |||
608 | /** |
||
609 | * @return bool |
||
610 | */ |
||
611 | 9 | protected function isLimitReached() |
|
615 | |||
616 | /** |
||
617 | * @param DomCrawler $crawler |
||
618 | * @return array |
||
619 | */ |
||
620 | 9 | protected function extractUrlsFromCrawler(DomCrawler $crawler) |
|
628 | |||
629 | /** |
||
630 | * @param string $url |
||
631 | * @return DomCrawler |
||
632 | */ |
||
633 | 9 | protected function requestPage($url) |
|
641 | } |
||
642 |