Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Crawler, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
13 | class Crawler implements LoggerAwareInterface |
||
14 | { |
||
15 | /** |
||
16 | * @var Client |
||
17 | */ |
||
18 | private $client; |
||
19 | |||
20 | /** |
||
21 | * @var int |
||
22 | */ |
||
23 | private $limit = 0; |
||
24 | |||
25 | /** |
||
26 | * @var bool |
||
27 | */ |
||
28 | private $stopOnError = false; |
||
29 | |||
30 | /** |
||
31 | * @var UrlMatcherInterface[] |
||
32 | */ |
||
33 | private $whitelistUrlMatchers = []; |
||
34 | |||
35 | /** |
||
36 | * @var UrlMatcherInterface[] |
||
37 | */ |
||
38 | private $blacklistUrlMatchers = []; |
||
39 | |||
40 | /** |
||
41 | * @var UrlNormalizerInterface[] |
||
42 | */ |
||
43 | private $urlNormalizers = []; |
||
44 | |||
45 | /** |
||
46 | * @var Url |
||
47 | */ |
||
48 | private $baseUrl; |
||
49 | |||
50 | /** |
||
51 | * @var array |
||
52 | */ |
||
53 | private $urlsCrawled = []; |
||
54 | |||
55 | /** |
||
56 | * @var array |
||
57 | */ |
||
58 | private $urlsQueued = []; |
||
59 | |||
60 | /** |
||
61 | * @var array |
||
62 | */ |
||
63 | private $urlsRejected = []; |
||
64 | |||
65 | /** |
||
66 | * @var array |
||
67 | */ |
||
68 | private $urlsReturned = []; |
||
69 | |||
70 | /** |
||
71 | * @var LoggerInterface |
||
72 | */ |
||
73 | private $logger = null; |
||
74 | |||
75 | /** |
||
76 | * @param Client $client |
||
77 | * @param array $options |
||
78 | */ |
||
79 | 9 | public function __construct(Client $client = null, array $options = []) |
|
90 | |||
91 | /** |
||
92 | * @param Client $client |
||
93 | */ |
||
94 | 9 | public function setClient(Client $client) |
|
98 | |||
99 | /** |
||
100 | * @return Client |
||
101 | */ |
||
102 | 2 | public function getClient() |
|
106 | |||
107 | /** |
||
108 | * @param array $options |
||
109 | */ |
||
110 | 9 | public function setOptions(array $options) |
|
128 | |||
129 | /** |
||
130 | * @return int |
||
131 | */ |
||
132 | 3 | public function getLimit() |
|
136 | |||
137 | /** |
||
138 | * @param int $limit |
||
139 | * @return $this |
||
140 | */ |
||
141 | 3 | public function setLimit($limit) |
|
147 | |||
148 | /** |
||
149 | * @return boolean |
||
150 | */ |
||
151 | 3 | public function getStopOnError() |
|
155 | |||
156 | /** |
||
157 | * @param boolean $stopOnError |
||
158 | * @return Crawler |
||
159 | */ |
||
160 | 2 | public function setStopOnError($stopOnError) |
|
166 | |||
167 | /** |
||
168 | * @return array |
||
169 | */ |
||
170 | 6 | public function getUrlsCrawled() |
|
174 | |||
175 | /** |
||
176 | * @return array |
||
177 | */ |
||
178 | 2 | public function getUrlsQueued() |
|
182 | |||
183 | /** |
||
184 | * @return array |
||
185 | */ |
||
186 | 2 | public function getUrlsRejected() |
|
190 | |||
191 | /** |
||
192 | * @return array |
||
193 | */ |
||
194 | 4 | public function getUrlsReturned() |
|
198 | |||
199 | /** |
||
200 | * @param $urlMatchers |
||
201 | * @return $this |
||
202 | */ |
||
203 | 2 | public function setWhitelistUrlMatchers(array $urlMatchers) |
|
212 | |||
213 | /** |
||
214 | * @return Url\Matcher\UrlMatcherInterface[] |
||
215 | */ |
||
216 | 3 | public function getWhitelistUrlMatchers() |
|
220 | |||
221 | /** |
||
222 | * @param UrlMatcherInterface $urlMatcher |
||
223 | * @return $this |
||
224 | */ |
||
225 | 3 | public function addWhitelistUrlMatcher(UrlMatcherInterface $urlMatcher) |
|
231 | |||
232 | /** |
||
233 | * @return $this |
||
234 | */ |
||
235 | 2 | public function clearWhitelistUrlMatchers() |
|
241 | |||
242 | /** |
||
243 | * @param array $urlMatchers |
||
244 | * @return $this |
||
245 | */ |
||
246 | 2 | public function setBlacklistUrlMatchers(array $urlMatchers) |
|
255 | |||
256 | /** |
||
257 | * @return Url\Matcher\UrlMatcherInterface[] |
||
258 | */ |
||
259 | 3 | public function getBlacklistUrlMatchers() |
|
263 | |||
264 | /** |
||
265 | * @param UrlMatcherInterface $urlMatcher |
||
266 | * @return $this |
||
267 | */ |
||
268 | 3 | public function addBlacklistUrlMatcher(UrlMatcherInterface $urlMatcher) |
|
274 | |||
275 | /** |
||
276 | * @return $this |
||
277 | */ |
||
278 | 2 | public function clearBlacklistUrlMatchers() |
|
284 | |||
285 | /** |
||
286 | * @param array $normalizers |
||
287 | * @return $this |
||
288 | */ |
||
289 | public function setUrlNormalizers(array $normalizers) |
||
299 | |||
300 | /** |
||
301 | * @param UrlNormalizerInterface $normalizer |
||
302 | * @return $this |
||
303 | */ |
||
304 | public function addUrlNormalizer(UrlNormalizerInterface $normalizer) |
||
310 | |||
311 | /** |
||
312 | * @return $this |
||
313 | */ |
||
314 | public function clearUrlNormalizers() |
||
320 | |||
321 | /** |
||
322 | * @return LoggerInterface |
||
323 | */ |
||
324 | 8 | public function getLogger() |
|
332 | |||
333 | /** |
||
334 | * @param LoggerInterface $logger |
||
335 | * @return $this |
||
336 | */ |
||
337 | 2 | public function setLogger(LoggerInterface $logger) |
|
343 | |||
344 | /** |
||
345 | * @param Url $url |
||
346 | */ |
||
347 | 5 | protected function addUrlToQueue(Url $url) |
|
351 | |||
352 | /** |
||
353 | * @param $url |
||
354 | * @return Url |
||
355 | */ |
||
356 | 5 | protected function createHttpUrlString($url) |
|
360 | |||
361 | /** |
||
362 | * @param Url $url |
||
363 | */ |
||
364 | 5 | protected function reset(Url $url) |
|
372 | |||
/**
 * Crawls pages starting at the given URL and lazily yields the ones that should be returned.
 *
 * The start URL is converted with createHttpUrlString() and handed to reset(). Queued URLs are
 * then requested one at a time until the queue is empty, the crawl limit is reached, or a
 * request fails while stop-on-error is enabled.
 *
 * @param string $url
 * @return \Generator yields a Page for every crawled URL accepted by shouldReturnUrl()
 */
public function crawl($url)
{
    $url = $this->createHttpUrlString($url);
    $this->reset($url);

    while (count($this->urlsQueued) > 0) {
        $url = array_shift($this->urlsQueued);

        try {
            $crawler = $this->requestPage((string)$url);
        } catch (\Exception $e) {
            $this->getLogger()->error(sprintf('Error requesting page %s: %s', $url, $e->getMessage()));

            if ($this->getStopOnError()) {
                return;
            }

            // A failed request is only logged; carry on with the next queued URL.
            continue;
        }

        $this->urlsCrawled[] = (string)$url;
        // Queue the links found on this page before deciding whether to return it.
        $this->updateQueue($crawler);

        if ($this->shouldReturnUrl($url)) {
            $this->getLogger()->debug(sprintf('Return url "%s"', $url));

            $this->urlsReturned[] = (string)$url;

            yield new Page($url, $crawler);
        }

        // The limit is checked after the page has been processed (and possibly yielded).
        if ($this->isLimitReached()) {
            $this->getLogger()->info(sprintf('Crawl limit of %d was reached', $this->limit));

            return;
        }
    }
}
|
416 | |||
/**
 * Adds the URLs found in the given page to the crawl queue.
 *
 * Every extracted URL that is not already rejected is converted with createHttpUrlString(),
 * normalized, and queued when shouldCrawlUrl() accepts it. If any of these steps throws, the
 * extracted URL is logged and remembered as rejected so it is skipped the next time it is found.
 *
 * @param DomCrawler $crawler
 */
protected function updateQueue(DomCrawler $crawler)
{
    foreach ($this->extractUrlsFromCrawler($crawler) as $foundUrl) {
        if (in_array($foundUrl, $this->urlsRejected)) {
            continue;
        }

        $this->getLogger()->debug(sprintf('Found url %s in page', $foundUrl));

        try {
            // Keep the extracted value in $foundUrl untouched: the catch block below must
            // reject exactly what the in_array() check above will see again, not a Url object.
            $url = $this->normalizeUrl($this->createHttpUrlString($foundUrl));

            if ($this->shouldCrawlUrl($url)) {
                $this->addUrlToQueue($url);
            }
        } catch (\Exception $e) {
            $this->getLogger()->warning(
                sprintf('Url %s could not be converted to an object: %s', $foundUrl, $e->getMessage())
            );
            $this->urlsRejected[] = $foundUrl;
        }
    }
}
|
440 | |||
441 | /** |
||
442 | * @param Url $url |
||
443 | * @return Url |
||
444 | */ |
||
445 | 4 | protected function normalizeUrl(Url $url) |
|
453 | |||
/**
 * Decides whether a crawled URL is handed back to the caller.
 *
 * When at least one whitelist matcher is configured, only the whitelist is consulted: the URL is
 * returned if any whitelist matcher matches it and skipped otherwise. Without whitelist matchers,
 * the URL is returned unless a blacklist matcher matches it.
 *
 * @param Url $url
 * @return bool
 */
protected function shouldReturnUrl(Url $url)
{
    $hasWhitelist = !empty($this->whitelistUrlMatchers);

    if ($hasWhitelist) {
        $isWhitelisted = false;

        foreach ($this->whitelistUrlMatchers as $whitelistMatcher) {
            if ($whitelistMatcher->matches($url)) {
                $isWhitelisted = true;
                break;
            }
        }

        if (!$isWhitelisted) {
            $this->getLogger()->info(sprintf('Skipped "%s" because it is not whitelisted', $url));
        }

        // The blacklist is intentionally not consulted once a whitelist is in place.
        return $isWhitelisted;
    }

    foreach ($this->blacklistUrlMatchers as $blacklistMatcher) {
        if (!$blacklistMatcher->matches($url)) {
            continue;
        }

        $this->getLogger()->info(sprintf('Skipped "%s" because it is blacklisted', $url));

        return false;
    }

    return true;
}
||
481 | |||
/**
 * Decides whether a URL should be added to the crawl queue.
 *
 * A URL is refused when it is already rejected, already crawled, or already present as a key in
 * the queue. A URL that is new but not part of the base URL is refused and remembered as rejected.
 *
 * @param Url $url
 * @return bool
 */
protected function shouldCrawlUrl(Url $url)
{
    $urlString = (string)$url;

    $alreadyKnown = in_array($urlString, $this->urlsRejected)
        || in_array($urlString, $this->urlsCrawled)
        || isset($this->urlsQueued[$urlString]);

    if ($alreadyKnown) {
        return false;
    }

    if ($this->isUrlPartOfBaseUrl($url)) {
        return true;
    }

    // Remember out-of-scope URLs so later occurrences are refused by the check above.
    $this->urlsRejected[] = $urlString;

    return false;
}
||
506 | |||
/**
 * Checks whether the given URL lies below the base URL of the current crawl.
 *
 * The comparison is done on the string forms: the URL is in scope only when it starts with the
 * base URL. A URL that merely contains the base URL somewhere else (for example as a query
 * parameter of a link to another site) is not in scope.
 *
 * @param Url $url
 * @return bool
 */
protected function isUrlPartOfBaseUrl(Url $url)
{
    $baseUrlString = (string)$this->baseUrl;
    $this->getLogger()->debug($baseUrlString.' - '.$url);

    // Position 0 means "starts with"; any other position, or false, means out of scope.
    return strpos((string)$url, $baseUrlString) === 0;
}
||
521 | |||
522 | /** |
||
523 | * @return bool |
||
524 | */ |
||
525 | 5 | private function isLimitReached() |
|
529 | |||
530 | /** |
||
531 | * @param DomCrawler $crawler |
||
532 | * @return array |
||
533 | */ |
||
534 | 5 | protected function extractUrlsFromCrawler(DomCrawler $crawler) |
|
542 | |||
543 | /** |
||
544 | * @param $url |
||
545 | * @return DomCrawler |
||
546 | */ |
||
547 | 5 | protected function requestPage($url) |
|
555 | } |
||
556 |