Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Crawler and, based on these observations, to apply Extract Interface as well.
1 | <?php |
||
14 | class Crawler implements LoggerAwareInterface |
||
15 | { |
||
16 | /** |
||
17 | * @var Client |
||
18 | */ |
||
19 | private $client; |
||
20 | |||
21 | /** |
||
22 | * @var int |
||
23 | */ |
||
24 | private $limit = 0; |
||
25 | |||
26 | /** |
||
27 | * @var bool |
||
28 | */ |
||
29 | private $stopOnError = false; |
||
30 | |||
31 | /** |
||
32 | * @var bool |
||
33 | */ |
||
34 | private $exceptionOnError = false; |
||
35 | |||
36 | /** |
||
37 | * @var UrlMatcherInterface[] |
||
38 | */ |
||
39 | private $whitelistUrlMatchers = []; |
||
40 | |||
41 | /** |
||
42 | * @var UrlMatcherInterface[] |
||
43 | */ |
||
44 | private $blacklistUrlMatchers = []; |
||
45 | |||
46 | /** |
||
47 | * @var UrlNormalizerInterface[] |
||
48 | */ |
||
49 | private $urlNormalizers = []; |
||
50 | |||
51 | /** |
||
52 | * @var Url |
||
53 | */ |
||
54 | private $baseUrl; |
||
55 | |||
56 | /** |
||
57 | * @var array |
||
58 | */ |
||
59 | private $urlsCrawled = []; |
||
60 | |||
61 | /** |
||
62 | * @var array |
||
63 | */ |
||
64 | private $urlsQueued = []; |
||
65 | |||
66 | /** |
||
67 | * @var array |
||
68 | */ |
||
69 | private $urlsRejected = []; |
||
70 | |||
71 | /** |
||
72 | * @var array |
||
73 | */ |
||
74 | private $urlsReturned = []; |
||
75 | |||
76 | /** |
||
77 | * @var LoggerInterface |
||
78 | */ |
||
79 | private $logger = null; |
||
80 | |||
81 | /** |
||
82 | * @param Client $client |
||
83 | */ |
||
84 | 15 | public function __construct(Client $client = null) |
|
94 | |||
95 | /** |
||
96 | * @param Client $client |
||
97 | */ |
||
98 | 15 | public function setClient(Client $client) |
|
102 | |||
103 | /** |
||
104 | * @return Client |
||
105 | */ |
||
106 | 2 | public function getClient() |
|
110 | |||
111 | /** |
||
112 | * @return int |
||
113 | */ |
||
114 | 2 | public function getLimit() |
|
118 | |||
119 | /** |
||
120 | * @param int $limit |
||
121 | * @return $this |
||
122 | */ |
||
123 | 2 | public function setLimit($limit) |
|
129 | |||
130 | /** |
||
131 | * @return boolean |
||
132 | */ |
||
133 | 5 | public function getStopOnError() |
|
137 | |||
138 | /** |
||
139 | * @param boolean $stopOnError |
||
140 | * @return $this |
||
141 | */ |
||
142 | 2 | public function setStopOnError($stopOnError) |
|
148 | |||
149 | /** |
||
150 | * @return boolean |
||
151 | */ |
||
152 | 2 | public function getExceptionOnError() |
|
156 | |||
157 | /** |
||
158 | * @param boolean $exceptionOnError |
||
159 | * @return $this |
||
160 | */ |
||
161 | 1 | public function setExceptionOnError($exceptionOnError) |
|
167 | |||
168 | /** |
||
169 | * @return array |
||
170 | */ |
||
171 | 9 | public function getUrlsCrawled() |
|
175 | |||
176 | /** |
||
177 | * @return array |
||
178 | */ |
||
179 | 2 | public function getUrlsQueued() |
|
183 | |||
184 | /** |
||
185 | * @return array |
||
186 | */ |
||
187 | 2 | public function getUrlsRejected() |
|
191 | |||
192 | /** |
||
193 | * @return array |
||
194 | */ |
||
195 | 4 | public function getUrlsReturned() |
|
199 | |||
200 | /** |
||
201 | * @param $urlMatchers |
||
202 | * @return $this |
||
203 | */ |
||
204 | 1 | public function setWhitelistUrlMatchers(array $urlMatchers) |
|
213 | |||
214 | /** |
||
215 | * @return Url\Matcher\UrlMatcherInterface[] |
||
216 | */ |
||
217 | 2 | public function getWhitelistUrlMatchers() |
|
221 | |||
222 | /** |
||
223 | * @param UrlMatcherInterface $urlMatcher |
||
224 | * @return $this |
||
225 | */ |
||
226 | 2 | public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher) |
|
232 | |||
233 | /** |
||
234 | * @return $this |
||
235 | */ |
||
236 | 1 | public function clearWhitelistUrlMatchers() |
|
242 | |||
243 | /** |
||
244 | * @param array $urlMatchers |
||
245 | * @return $this |
||
246 | */ |
||
247 | 1 | public function setBlacklistUrlMatchers(array $urlMatchers) |
|
256 | |||
257 | /** |
||
258 | * @return Url\Matcher\UrlMatcherInterface[] |
||
259 | */ |
||
260 | 2 | public function getBlacklistUrlMatchers() |
|
264 | |||
265 | /** |
||
266 | * @param UrlMatcherInterface $urlMatcher |
||
267 | * @return $this |
||
268 | */ |
||
269 | 2 | public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher) |
|
275 | |||
276 | /** |
||
277 | * @return $this |
||
278 | */ |
||
279 | 1 | public function clearBlacklistUrlMatchers() |
|
285 | |||
286 | /** |
||
287 | * @param array $normalizers |
||
288 | * @return $this |
||
289 | */ |
||
290 | 1 | public function setUrlNormalizers(array $normalizers) |
|
300 | |||
301 | /** |
||
302 | * @return UrlNormalizerInterface[] |
||
303 | */ |
||
304 | 1 | public function getUrlNormalizers() |
|
308 | |||
309 | /** |
||
310 | * @param UrlNormalizerInterface $normalizer |
||
311 | * @return $this |
||
312 | */ |
||
313 | 2 | public function addUrlNormalizer(UrlNormalizerInterface $normalizer) |
|
319 | |||
320 | /** |
||
321 | * @return $this |
||
322 | */ |
||
323 | 1 | public function clearUrlNormalizers() |
|
329 | |||
330 | /** |
||
331 | * @return LoggerInterface |
||
332 | */ |
||
333 | 13 | public function getLogger() |
|
341 | |||
342 | /** |
||
343 | * @param LoggerInterface $logger |
||
344 | * @return $this |
||
345 | */ |
||
346 | 1 | public function setLogger(LoggerInterface $logger) |
|
352 | |||
353 | /** |
||
354 | * @param Url $url |
||
355 | */ |
||
356 | 11 | protected function addUrlToQueue(Url $url) |
|
360 | |||
361 | /** |
||
362 | * @param string $url |
||
363 | * @return Url |
||
364 | */ |
||
365 | 10 | protected function createHttpUrlString($url) |
|
369 | |||
370 | /** |
||
371 | * @param Url $url |
||
372 | */ |
||
373 | 11 | protected function reset(Url $url) |
|
381 | |||
382 | /** |
||
383 | * @param string $url |
||
384 | * @return \Generator|Page[] |
||
385 | * @throws RequestException |
||
386 | */ |
||
387 | 10 | public function crawl($url) |
|
429 | |||
430 | /** |
||
431 | * @param Url $url |
||
432 | * @return Url |
||
433 | */ |
||
434 | 10 | protected function updateUrl(Url $url) |
|
443 | |||
444 | /** |
||
445 | * @param DomCrawler $crawler |
||
446 | */ |
||
447 | 9 | protected function updateQueue(DomCrawler $crawler) |
|
465 | |||
466 | /** |
||
467 | * @param Url $url |
||
468 | * @return Url |
||
469 | */ |
||
470 | 7 | protected function normalizeUrl(Url $url) |
|
478 | |||
479 | /** |
||
480 | * @param Url $url |
||
481 | * @return bool |
||
482 | */ |
||
483 | 9 | protected function shouldReturnUrl(Url $url) |
|
501 | |||
502 | /** |
||
503 | * @param Url $url |
||
504 | * @return bool |
||
505 | */ |
||
506 | 1 | protected function isUrlWhitelisted(Url $url) |
|
516 | |||
517 | /** |
||
518 | * @param Url $url |
||
519 | * @return bool |
||
520 | */ |
||
521 | 9 | protected function isUrlBlacklisted(Url $url) |
|
531 | |||
532 | /** |
||
533 | * @param Url $url |
||
534 | * @return bool |
||
535 | */ |
||
536 | 8 | protected function shouldCrawlUrl(Url $url) |
|
553 | |||
554 | /** |
||
555 | * @param Url $url |
||
556 | * @return bool |
||
557 | */ |
||
558 | 8 | protected function isUrlRejected(Url $url) |
|
562 | |||
563 | /** |
||
564 | * @param Url $url |
||
565 | * @return bool |
||
566 | */ |
||
567 | 8 | protected function isUrlCrawled(Url $url) |
|
571 | |||
572 | /** |
||
573 | * @param Url $url |
||
574 | * @return bool |
||
575 | */ |
||
576 | 8 | protected function isUrlQueued(Url $url) |
|
580 | |||
581 | /** |
||
582 | * @param Url $url |
||
583 | * @return bool |
||
584 | */ |
||
585 | 8 | protected function isUrlPartOfBaseUrl(Url $url) |
|
595 | |||
596 | /** |
||
597 | * @return bool |
||
598 | */ |
||
599 | 9 | protected function isLimitReached() |
|
603 | |||
604 | /** |
||
605 | * @param DomCrawler $crawler |
||
606 | * @return array |
||
607 | */ |
||
608 | 9 | protected function extractUrlsFromCrawler(DomCrawler $crawler) |
|
616 | |||
617 | /** |
||
618 | * @param string $url |
||
619 | * @return DomCrawler |
||
620 | */ |
||
621 | 10 | protected function requestPage($url) |
|
629 | } |
||
630 |
Unless you are absolutely sure that the expression can never be null because of other conditions, we strongly recommend adding an additional type check to your code: