Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Crawler, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
13 | class Crawler implements LoggerAwareInterface |
||
14 | { |
||
15 | /** |
||
16 | * @var Client |
||
17 | */ |
||
18 | private $client; |
||
19 | |||
20 | /** |
||
21 | * @var int |
||
22 | */ |
||
23 | private $limit = 0; |
||
24 | |||
25 | /** |
||
26 | * @var bool |
||
27 | */ |
||
28 | private $stopOnError = false; |
||
29 | |||
30 | /** |
||
31 | * @var UrlMatcherInterface[] |
||
32 | */ |
||
33 | private $whitelistUrlMatchers = []; |
||
34 | |||
35 | /** |
||
36 | * @var UrlMatcherInterface[] |
||
37 | */ |
||
38 | private $blacklistUrlMatchers = []; |
||
39 | |||
40 | /** |
||
41 | * @var UrlNormalizerInterface[] |
||
42 | */ |
||
43 | private $urlNormalizers = []; |
||
44 | |||
45 | /** |
||
46 | * @var Url |
||
47 | */ |
||
48 | private $baseUrl; |
||
49 | |||
50 | /** |
||
51 | * @var array |
||
52 | */ |
||
53 | private $urlsCrawled = []; |
||
54 | |||
55 | /** |
||
56 | * @var array |
||
57 | */ |
||
58 | private $urlsQueued = []; |
||
59 | |||
60 | /** |
||
61 | * @var array |
||
62 | */ |
||
63 | private $urlsRejected = []; |
||
64 | |||
65 | /** |
||
66 | * @var array |
||
67 | */ |
||
68 | private $urlsReturned = []; |
||
69 | |||
70 | /** |
||
71 | * @var LoggerInterface |
||
72 | */ |
||
73 | private $logger = null; |
||
74 | |||
75 | /** |
||
76 | * @param Client $client |
||
77 | * @param array $options |
||
78 | */ |
||
79 | 13 | public function __construct(Client $client = null, array $options = []) |
|
90 | |||
91 | /** |
||
92 | * @param Client $client |
||
93 | */ |
||
94 | 13 | public function setClient(Client $client) |
|
98 | |||
99 | /** |
||
100 | * @return Client |
||
101 | */ |
||
102 | 2 | public function getClient() |
|
106 | |||
/**
 * Applies the recognised entries of an options array through their setters.
 *
 * Recognised keys are `limit`, `stop_on_error`, `logger`,
 * `whitelist_url_matchers`, `blacklist_url_matchers` and `url_normalizers`.
 * A key that is absent, or whose value is null, is skipped (isset semantics);
 * keys not listed here are ignored.
 *
 * @param array $options
 */
public function setOptions(array $options)
{
    // Option key => setter method, in the order the setters are invoked.
    $setters = [
        'limit'                  => 'setLimit',
        'stop_on_error'          => 'setStopOnError',
        'logger'                 => 'setLogger',
        'whitelist_url_matchers' => 'setWhitelistUrlMatchers',
        'blacklist_url_matchers' => 'setBlacklistUrlMatchers',
        'url_normalizers'        => 'setUrlNormalizers',
    ];

    foreach ($setters as $optionKey => $setter) {
        if (!isset($options[$optionKey])) {
            continue;
        }

        $this->{$setter}($options[$optionKey]);
    }
}
131 | |||
132 | /** |
||
133 | * @return int |
||
134 | */ |
||
135 | 3 | public function getLimit() |
|
139 | |||
140 | /** |
||
141 | * @param int $limit |
||
142 | * @return $this |
||
143 | */ |
||
144 | 3 | public function setLimit($limit) |
|
150 | |||
151 | /** |
||
152 | * @return boolean |
||
153 | */ |
||
154 | 5 | public function getStopOnError() |
|
158 | |||
159 | /** |
||
160 | * @param boolean $stopOnError |
||
161 | * @return Crawler |
||
162 | */ |
||
163 | 3 | public function setStopOnError($stopOnError) |
|
169 | |||
170 | /** |
||
171 | * @return array |
||
172 | */ |
||
173 | 9 | public function getUrlsCrawled() |
|
177 | |||
178 | /** |
||
179 | * @return array |
||
180 | */ |
||
181 | 2 | public function getUrlsQueued() |
|
185 | |||
186 | /** |
||
187 | * @return array |
||
188 | */ |
||
189 | 2 | public function getUrlsRejected() |
|
193 | |||
194 | /** |
||
195 | * @return array |
||
196 | */ |
||
197 | 4 | public function getUrlsReturned() |
|
201 | |||
202 | /** |
||
203 | * @param $urlMatchers |
||
204 | * @return $this |
||
205 | */ |
||
206 | 2 | public function setWhitelistUrlMatchers(array $urlMatchers) |
|
215 | |||
216 | /** |
||
217 | * @return Url\Matcher\UrlMatcherInterface[] |
||
218 | */ |
||
219 | 3 | public function getWhitelistUrlMatchers() |
|
223 | |||
224 | /** |
||
225 | * @param UrlMatcherInterface $urlMatcher |
||
226 | * @return $this |
||
227 | */ |
||
228 | 3 | public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher) |
|
234 | |||
235 | /** |
||
236 | * @return $this |
||
237 | */ |
||
238 | 2 | public function clearWhitelistUrlMatchers() |
|
244 | |||
245 | /** |
||
246 | * @param array $urlMatchers |
||
247 | * @return $this |
||
248 | */ |
||
249 | 2 | public function setBlacklistUrlMatchers(array $urlMatchers) |
|
258 | |||
259 | /** |
||
260 | * @return Url\Matcher\UrlMatcherInterface[] |
||
261 | */ |
||
262 | 3 | public function getBlacklistUrlMatchers() |
|
266 | |||
267 | /** |
||
268 | * @param UrlMatcherInterface $urlMatcher |
||
269 | * @return $this |
||
270 | */ |
||
271 | 3 | public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher) |
|
277 | |||
278 | /** |
||
279 | * @return $this |
||
280 | */ |
||
281 | 2 | public function clearBlacklistUrlMatchers() |
|
287 | |||
288 | /** |
||
289 | * @param array $normalizers |
||
290 | * @return $this |
||
291 | */ |
||
292 | 1 | public function setUrlNormalizers(array $normalizers) |
|
302 | |||
303 | /** |
||
304 | * @return UrlNormalizerInterface[] |
||
305 | */ |
||
306 | 1 | public function getUrlNormalizers() |
|
310 | |||
311 | /** |
||
312 | * @param UrlNormalizerInterface $normalizer |
||
313 | * @return $this |
||
314 | */ |
||
315 | 2 | public function addUrlNormalizer(UrlNormalizerInterface $normalizer) |
|
321 | |||
322 | /** |
||
323 | * @return $this |
||
324 | */ |
||
325 | 1 | public function clearUrlNormalizers() |
|
331 | |||
332 | /** |
||
333 | * @return LoggerInterface |
||
334 | */ |
||
335 | 11 | public function getLogger() |
|
343 | |||
344 | /** |
||
345 | * @param LoggerInterface $logger |
||
346 | * @return $this |
||
347 | */ |
||
348 | 2 | public function setLogger(LoggerInterface $logger) |
|
354 | |||
355 | /** |
||
356 | * @param Url $url |
||
357 | */ |
||
358 | 8 | protected function addUrlToQueue(Url $url) |
|
362 | |||
363 | /** |
||
364 | * @param string $url |
||
365 | * @return Url |
||
366 | */ |
||
367 | 8 | protected function createHttpUrlString($url) |
|
371 | |||
372 | /** |
||
373 | * @param Url $url |
||
374 | */ |
||
375 | 8 | protected function reset(Url $url) |
|
383 | |||
/**
 * Crawls starting from the given URL and lazily yields the pages to return.
 *
 * The start URL is converted with createHttpUrlString() and handed to reset()
 * before the queue is processed (reset() is presumably what seeds the queue;
 * its body is not visible here). Each queued URL is then requested in turn:
 *
 *  - If the request throws, the error is logged. The crawl ends when
 *    stop-on-error is enabled; otherwise the URL is skipped.
 *  - On success the URL is recorded as crawled and the queue is updated from
 *    the fetched document.
 *  - If shouldReturnUrl() accepts the URL, it is recorded as returned and a
 *    Page wrapping the URL and its DomCrawler is yielded.
 *  - After each successfully processed URL the crawl limit is checked and the
 *    crawl ends once it is reached.
 *
 * Because this is a generator, nothing is executed until it is iterated.
 *
 * @param string $url
 * @return \Generator
 */
public function crawl($url)
{
    $startUrl = $this->createHttpUrlString($url);
    $this->reset($startUrl);

    while (count($this->urlsQueued) > 0) {
        $currentUrl = array_shift($this->urlsQueued);

        try {
            $crawler = $this->requestPage((string)$currentUrl);
        } catch (\Exception $e) {
            $this->getLogger()->error(
                sprintf('Error requesting page %s: %s', $currentUrl, $e->getMessage())
            );

            if ($this->getStopOnError()) {
                return;
            }

            // Best-effort mode: drop this URL and carry on with the rest of the queue.
            continue;
        }

        $this->urlsCrawled[] = (string)$currentUrl;
        $this->updateQueue($crawler);

        if ($this->shouldReturnUrl($currentUrl)) {
            $this->getLogger()->debug(sprintf('Return url "%s"', $currentUrl));

            $this->urlsReturned[] = (string)$currentUrl;

            yield new Page($currentUrl, $crawler);
        }

        // Checked only after a successful request; failed requests skip this via `continue`.
        if ($this->isLimitReached()) {
            $this->getLogger()->info(sprintf('Crawl limit of %d was reached', $this->limit));

            return;
        }
    }
}
427 | |||
428 | /** |
||
429 | * @param DomCrawler $crawler |
||
430 | */ |
||
431 | 8 | protected function updateQueue(DomCrawler $crawler) |
|
449 | |||
450 | /** |
||
451 | * @param Url $url |
||
452 | * @return Url |
||
453 | */ |
||
454 | 7 | protected function normalizeUrl(Url $url) |
|
462 | |||
463 | /** |
||
464 | * @param Url $url |
||
465 | * @return bool |
||
466 | */ |
||
467 | 8 | protected function shouldReturnUrl(Url $url) |
|
485 | |||
486 | /** |
||
487 | * @param Url $url |
||
488 | * @return bool |
||
489 | */ |
||
490 | 1 | protected function isUrlWhitelisted(Url $url) |
|
500 | |||
501 | /** |
||
502 | * @param Url $url |
||
503 | * @return bool |
||
504 | */ |
||
505 | 8 | protected function isUrlBlacklisted(Url $url) |
|
515 | |||
516 | /** |
||
517 | * @param Url $url |
||
518 | * @return bool |
||
519 | */ |
||
520 | 7 | protected function shouldCrawlUrl(Url $url) |
|
534 | |||
535 | /** |
||
536 | * @param Url $url |
||
537 | * @return bool |
||
538 | */ |
||
539 | 7 | protected function isUrlRejected(Url $url) |
|
543 | |||
544 | /** |
||
545 | * @param Url $url |
||
546 | * @return bool |
||
547 | */ |
||
548 | 7 | protected function isUrlCrawled(Url $url) |
|
552 | |||
553 | /** |
||
554 | * @param Url $url |
||
555 | * @return bool |
||
556 | */ |
||
557 | 7 | protected function isUrlQueued(Url $url) |
|
561 | |||
562 | /** |
||
563 | * @param Url $url |
||
564 | * @return bool |
||
565 | */ |
||
566 | 7 | protected function isUrlPartOfBaseUrl(Url $url) |
|
576 | |||
577 | /** |
||
578 | * @return bool |
||
579 | */ |
||
580 | 8 | protected function isLimitReached() |
|
584 | |||
585 | /** |
||
586 | * @param DomCrawler $crawler |
||
587 | * @return array |
||
588 | */ |
||
589 | 8 | protected function extractUrlsFromCrawler(DomCrawler $crawler) |
|
597 | |||
598 | /** |
||
599 | * @param $url |
||
600 | * @return DomCrawler |
||
601 | */ |
||
602 | 8 | protected function requestPage($url) |
|
610 | } |
||
611 |