Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes; in Crawler, for example, the fields urlsCrawled, urlsQueued, urlsRejected and urlsReturned share the prefix "urls". You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined which fields belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster to apply.
While breaking up the class, it is also a good idea to analyze how other classes use Crawler and, based on these observations, to apply Extract Interface as well.
1 | <?php |
||
14 | class Crawler implements LoggerAwareInterface |
||
15 | { |
||
16 | /** |
||
17 | * @var Client |
||
18 | */ |
||
19 | private $client; |
||
20 | |||
21 | /** |
||
22 | * @var int |
||
23 | */ |
||
24 | private $limit = 0; |
||
25 | |||
26 | /** |
||
27 | * @var bool |
||
28 | */ |
||
29 | private $stopOnError = false; |
||
30 | |||
31 | /** |
||
32 | * @var bool |
||
33 | */ |
||
34 | private $exceptionOnError = false; |
||
35 | |||
36 | /** |
||
37 | * @var UrlMatcherInterface[] |
||
38 | */ |
||
39 | private $whitelistUrlMatchers = []; |
||
40 | |||
41 | /** |
||
42 | * @var UrlMatcherInterface[] |
||
43 | */ |
||
44 | private $blacklistUrlMatchers = []; |
||
45 | |||
46 | /** |
||
47 | * @var UrlNormalizerInterface[] |
||
48 | */ |
||
49 | private $urlNormalizers = []; |
||
50 | |||
51 | /** |
||
52 | * @var Url |
||
53 | */ |
||
54 | private $baseUrl; |
||
55 | |||
56 | /** |
||
57 | * @var array |
||
58 | */ |
||
59 | private $urlsCrawled = []; |
||
60 | |||
61 | /** |
||
62 | * @var array |
||
63 | */ |
||
64 | private $urlsQueued = []; |
||
65 | |||
66 | /** |
||
67 | * @var array |
||
68 | */ |
||
69 | private $urlsRejected = []; |
||
70 | |||
71 | /** |
||
72 | * @var array |
||
73 | */ |
||
74 | private $urlsReturned = []; |
||
75 | |||
76 | /** |
||
77 | * @var LoggerInterface |
||
78 | */ |
||
79 | private $logger = null; |
||
80 | |||
81 | /** |
||
82 | * @param Client|null $client |
||
83 | */ |
||
84 | 13 | public function __construct(Client $client = null) |
|
94 | |||
95 | /** |
||
96 | * @param Client $client |
||
97 | */ |
||
98 | 13 | public function setClient(Client $client) |
|
102 | |||
103 | /** |
||
104 | * @return Client |
||
105 | */ |
||
106 | 2 | public function getClient() |
|
110 | |||
111 | /** |
||
112 | * @return int |
||
113 | */ |
||
114 | 2 | public function getLimit() |
|
118 | |||
119 | /** |
||
120 | * @param int $limit |
||
121 | * @return $this |
||
122 | */ |
||
123 | 2 | public function setLimit($limit) |
|
129 | |||
130 | /** |
||
131 | * @return boolean |
||
132 | */ |
||
133 | 5 | public function getStopOnError() |
|
137 | |||
138 | /** |
||
139 | * @param boolean $stopOnError |
||
140 | * @return $this |
||
141 | */ |
||
142 | 2 | public function setStopOnError($stopOnError) |
|
148 | |||
149 | /** |
||
150 | * @return boolean |
||
151 | */ |
||
152 | 2 | public function getExceptionOnError() |
|
156 | |||
157 | /** |
||
158 | * @param boolean $exceptionOnError |
||
159 | * @return $this |
||
160 | */ |
||
161 | 1 | public function setExceptionOnError($exceptionOnError) |
|
167 | |||
168 | /** |
||
169 | * @return array |
||
170 | */ |
||
171 | 9 | public function getUrlsCrawled() |
|
175 | |||
176 | /** |
||
177 | * @return array |
||
178 | */ |
||
179 | 2 | public function getUrlsQueued() |
|
183 | |||
184 | /** |
||
185 | * @return array |
||
186 | */ |
||
187 | 2 | public function getUrlsRejected() |
|
191 | |||
192 | /** |
||
193 | * @return array |
||
194 | */ |
||
195 | 4 | public function getUrlsReturned() |
|
199 | |||
200 | /** |
||
201 | * @param array $urlMatchers |
||
202 | * @return $this |
||
203 | */ |
||
204 | 1 | public function setWhitelistUrlMatchers(array $urlMatchers) |
|
213 | |||
214 | /** |
||
215 | * @return Url\Matcher\UrlMatcherInterface[] |
||
216 | */ |
||
217 | 2 | public function getWhitelistUrlMatchers() |
|
221 | |||
222 | /** |
||
223 | * @param UrlMatcherInterface $urlMatcher |
||
224 | * @return $this |
||
225 | */ |
||
226 | 2 | public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher) |
|
232 | |||
233 | /** |
||
234 | * @return $this |
||
235 | */ |
||
236 | 1 | public function clearWhitelistUrlMatchers() |
|
242 | |||
243 | /** |
||
244 | * @param array $urlMatchers |
||
245 | * @return $this |
||
246 | */ |
||
247 | 1 | public function setBlacklistUrlMatchers(array $urlMatchers) |
|
256 | |||
257 | /** |
||
258 | * @return Url\Matcher\UrlMatcherInterface[] |
||
259 | */ |
||
260 | 2 | public function getBlacklistUrlMatchers() |
|
264 | |||
265 | /** |
||
266 | * @param UrlMatcherInterface $urlMatcher |
||
267 | * @return $this |
||
268 | */ |
||
269 | 2 | public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher) |
|
275 | |||
276 | /** |
||
277 | * @return $this |
||
278 | */ |
||
279 | 1 | public function clearBlacklistUrlMatchers() |
|
285 | |||
286 | /** |
||
287 | * @param array $normalizers |
||
288 | * @return $this |
||
289 | */ |
||
290 | 1 | public function setUrlNormalizers(array $normalizers) |
|
300 | |||
301 | /** |
||
302 | * @return UrlNormalizerInterface[] |
||
303 | */ |
||
304 | 1 | public function getUrlNormalizers() |
|
308 | |||
309 | /** |
||
310 | * @param UrlNormalizerInterface $normalizer |
||
311 | * @return $this |
||
312 | */ |
||
313 | 2 | public function addUrlNormalizer(UrlNormalizerInterface $normalizer) |
|
319 | |||
320 | /** |
||
321 | * @return $this |
||
322 | */ |
||
323 | 1 | public function clearUrlNormalizers() |
|
329 | |||
330 | /** |
||
331 | * @return LoggerInterface |
||
332 | */ |
||
333 | 11 | public function getLogger() |
|
341 | |||
342 | /** |
||
343 | * @param LoggerInterface $logger |
||
344 | * @return $this |
||
345 | */ |
||
346 | 1 | public function setLogger(LoggerInterface $logger) |
|
352 | |||
353 | /** |
||
354 | * @param Url $url |
||
355 | */ |
||
356 | 9 | protected function addUrlToQueue(Url $url) |
|
360 | |||
361 | /** |
||
362 | * @param string $url |
||
363 | * @return Url |
||
364 | */ |
||
365 | 9 | protected function createHttpUrlString($url) |
|
369 | |||
370 | /** |
||
371 | * @param Url $url |
||
372 | */ |
||
373 | 9 | protected function reset(Url $url) |
|
381 | |||
382 | /** |
||
383 | * @param string $url |
||
384 | * @return \Generator |
||
385 | * @throws RequestException |
||
386 | */ |
||
387 | 9 | public function crawl($url) |
|
428 | |||
429 | /** |
||
430 | * @param DomCrawler $crawler |
||
431 | */ |
||
432 | 9 | protected function updateQueue(DomCrawler $crawler) |
|
450 | |||
451 | /** |
||
452 | * @param Url $url |
||
453 | * @return Url |
||
454 | */ |
||
455 | 8 | protected function normalizeUrl(Url $url) |
|
463 | |||
464 | /** |
||
465 | * @param Url $url |
||
466 | * @return bool |
||
467 | */ |
||
468 | 9 | protected function shouldReturnUrl(Url $url) |
|
486 | |||
487 | /** |
||
488 | * @param Url $url |
||
489 | * @return bool |
||
490 | */ |
||
491 | 1 | protected function isUrlWhitelisted(Url $url) |
|
501 | |||
502 | /** |
||
503 | * @param Url $url |
||
504 | * @return bool |
||
505 | */ |
||
506 | 9 | protected function isUrlBlacklisted(Url $url) |
|
516 | |||
517 | /** |
||
518 | * @param Url $url |
||
519 | * @return bool |
||
520 | */ |
||
521 | 8 | protected function shouldCrawlUrl(Url $url) |
|
535 | |||
536 | /** |
||
537 | * @param Url $url |
||
538 | * @return bool |
||
539 | */ |
||
540 | 8 | protected function isUrlRejected(Url $url) |
|
544 | |||
545 | /** |
||
546 | * @param Url $url |
||
547 | * @return bool |
||
548 | */ |
||
549 | 8 | protected function isUrlCrawled(Url $url) |
|
553 | |||
554 | /** |
||
555 | * @param Url $url |
||
556 | * @return bool |
||
557 | */ |
||
558 | 8 | protected function isUrlQueued(Url $url) |
|
562 | |||
563 | /** |
||
564 | * @param Url $url |
||
565 | * @return bool |
||
566 | */ |
||
567 | 8 | protected function isUrlPartOfBaseUrl(Url $url) |
|
577 | |||
578 | /** |
||
579 | * @return bool |
||
580 | */ |
||
581 | 9 | protected function isLimitReached() |
|
585 | |||
586 | /** |
||
587 | * @param DomCrawler $crawler |
||
588 | * @return array |
||
589 | */ |
||
590 | 9 | protected function extractUrlsFromCrawler(DomCrawler $crawler) |
|
598 | |||
599 | /** |
||
600 | * @param string $url |
||
601 | * @return DomCrawler |
||
602 | */ |
||
603 | 9 | protected function requestPage($url) |
|
611 | } |
||
612 |