Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Crawler and, based on these observations, to apply Extract Interface as well.
| 1 | <?php |
||
| 22 | class Crawler |
||
| 23 | { |
||
| 24 | use CrawlerProperties; |
||
| 25 | |||
| 26 | /** @var \GuzzleHttp\Client */ |
||
| 27 | protected $client; |
||
| 28 | |||
| 29 | /** @var \Psr\Http\Message\UriInterface */ |
||
| 30 | protected $baseUrl; |
||
| 31 | |||
| 32 | /** @var \Spatie\Crawler\CrawlObserver[] */ |
||
| 33 | protected $crawlObservers; |
||
| 34 | |||
| 35 | /** @var \Spatie\Crawler\CrawlProfile */ |
||
| 36 | protected $crawlProfile; |
||
| 37 | |||
| 38 | /** @var int */ |
||
| 39 | protected $concurrency; |
||
| 40 | |||
| 41 | /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */ |
||
| 42 | protected $crawlQueue; |
||
| 43 | |||
| 44 | /** @var int */ |
||
| 45 | protected $crawledUrlCount = 0; |
||
| 46 | |||
| 47 | /** @var int|null */ |
||
| 48 | protected $maximumCrawlCount = null; |
||
| 49 | |||
| 50 | /** @var int */ |
||
| 51 | protected $maximumResponseSize = 1024 * 1024 * 2; |
||
| 52 | |||
| 53 | /** @var int|null */ |
||
| 54 | protected $maximumDepth = null; |
||
| 55 | |||
| 56 | /** @var bool */ |
||
| 57 | protected $respectRobots = true; |
||
| 58 | |||
| 59 | /** @var \Tree\Node\Node */ |
||
| 60 | protected $depthTree; |
||
| 61 | |||
| 62 | /** @var bool */ |
||
| 63 | protected $executeJavaScript = false; |
||
| 64 | |||
| 65 | /** @var Browsershot */ |
||
| 66 | protected $browsershot = null; |
||
| 67 | |||
| 68 | /** @var \Spatie\Robots\RobotsTxt */ |
||
| 69 | protected $robotsTxt = null; |
||
| 70 | |||
| 71 | protected static $defaultClientOptions = [ |
||
| 72 | RequestOptions::COOKIES => true, |
||
| 73 | RequestOptions::CONNECT_TIMEOUT => 10, |
||
| 74 | RequestOptions::TIMEOUT => 10, |
||
| 75 | RequestOptions::ALLOW_REDIRECTS => false, |
||
| 76 | ]; |
||
| 77 | |||
| 78 | /** |
||
| 79 | * @param array $clientOptions |
||
| 80 | * |
||
| 81 | * @return static |
||
| 82 | */ |
||
| 83 | public static function create(array $clientOptions = []) |
||
| 93 | |||
| 94 | public function __construct(Client $client, int $concurrency = 10) |
||
| 104 | |||
| 105 | /** |
||
| 106 | * @param \Psr\Http\Message\UriInterface|string $baseUrl |
||
| 107 | */ |
||
| 108 | public function startCrawling($baseUrl) |
||
| 140 | |||
| 141 | protected function startCrawlingQueue() |
||
| 156 | |||
| 157 | public function endsWith($haystack, $needle) |
||
| 162 | |||
| 163 | protected function createRobotsTxt(UriInterface $uri): RobotsTxt |
||
| 167 | |||
| 168 | protected function getCrawlRequests(): Generator |
||
| 189 | |||
| 190 | public function addAllLinksToCrawlQueue(string $html, UriInterface $foundOnUrl) |
||
| 227 | |||
| 228 | protected function shouldCrawl(Node $node): bool |
||
| 240 | |||
| 241 | /** |
||
| 242 | * @param string $html |
||
| 243 | * @param \Psr\Http\Message\UriInterface $foundOnUrl |
||
| 244 | * |
||
| 245 | * @return \Illuminate\Support\Collection|\Tightenco\Collect\Support\Collection|null |
||
| 246 | */ |
||
| 247 | protected function extractAllLinks(string $html, UriInterface $foundOnUrl) |
||
| 268 | |||
| 269 | protected function normalizeUrl(UriInterface $url): UriInterface |
||
| 273 | |||
| 274 | protected function hasCrawlableScheme(UriInterface $uri): bool |
||
| 278 | |||
| 279 | protected function addToDepthTree(Node $node, UriInterface $url, UriInterface $parentUrl) |
||
| 301 | |||
| 302 | protected function getBodyAfterExecutingJavaScript(UriInterface $foundOnUrl): string |
||
| 310 | |||
| 311 | protected function getBrowsershot(): Browsershot |
||
| 319 | |||
| 320 | public function setBrowsershot(Browsershot $browsershot) |
||
| 326 | |||
| 327 | protected function addToCrawlQueue(CrawlUrl $crawlUrl) |
||
| 335 | |||
| 336 | protected function maximumCrawlCountReached(): bool |
||
| 344 | } |
||
| 345 |