Complex classes like Crawler often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly-connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Crawler and, based on these observations, to apply Extract Interface as well.
| 1 | <?php |
||
| 20 | class Crawler |
||
| 21 | { |
||
| 22 | /** @var \GuzzleHttp\Client */ |
||
| 23 | protected $client; |
||
| 24 | |||
| 25 | /** @var \Psr\Http\Message\UriInterface */ |
||
| 26 | protected $baseUrl; |
||
| 27 | |||
| 28 | /** @var \Spatie\Crawler\CrawlObserverCollection */ |
||
| 29 | protected $crawlObservers; |
||
| 30 | |||
| 31 | /** @var \Spatie\Crawler\CrawlProfile */ |
||
| 32 | protected $crawlProfile; |
||
| 33 | |||
| 34 | /** @var int */ |
||
| 35 | protected $concurrency; |
||
| 36 | |||
| 37 | /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */ |
||
| 38 | protected $crawlQueue; |
||
| 39 | |||
| 40 | /** @var int */ |
||
| 41 | protected $crawledUrlCount = 0; |
||
| 42 | |||
| 43 | /** @var int|null */ |
||
| 44 | protected $maximumCrawlCount = null; |
||
| 45 | |||
| 46 | /** @var int */ |
||
| 47 | protected $maximumResponseSize = 1024 * 1024 * 2; |
||
| 48 | |||
| 49 | /** @var int|null */ |
||
| 50 | protected $maximumDepth = null; |
||
| 51 | |||
| 52 | /** @var bool */ |
||
| 53 | protected $respectRobots = true; |
||
| 54 | |||
| 55 | /** @var \Tree\Node\Node */ |
||
| 56 | protected $depthTree; |
||
| 57 | |||
| 58 | /** @var bool */ |
||
| 59 | protected $executeJavaScript = false; |
||
| 60 | |||
| 61 | /** @var Browsershot */ |
||
| 62 | protected $browsershot = null; |
||
| 63 | |||
| 64 | /** @var \Spatie\Robots\RobotsTxt */ |
||
| 65 | protected $robotsTxt = null; |
||
| 66 | |||
| 67 | protected static $defaultClientOptions = [ |
||
| 68 | RequestOptions::COOKIES => true, |
||
| 69 | RequestOptions::CONNECT_TIMEOUT => 10, |
||
| 70 | RequestOptions::TIMEOUT => 10, |
||
| 71 | RequestOptions::ALLOW_REDIRECTS => false, |
||
| 72 | ]; |
||
| 73 | |||
| 74 | /** |
||
| 75 | * @param array $clientOptions |
||
| 76 | * |
||
| 77 | * @return static |
||
| 78 | */ |
||
| 79 | public static function create(array $clientOptions = []) |
||
| 80 | { |
||
| 81 | $clientOptions = (count($clientOptions)) |
||
| 82 | ? $clientOptions |
||
| 83 | : static::$defaultClientOptions; |
||
| 84 | |||
| 85 | $client = new Client($clientOptions); |
||
| 86 | |||
| 87 | return new static($client); |
||
| 88 | } |
||
| 89 | |||
| 90 | public function __construct(Client $client, int $concurrency = 10) |
||
| 91 | { |
||
| 92 | $this->client = $client; |
||
| 93 | |||
| 94 | $this->concurrency = $concurrency; |
||
| 95 | |||
| 96 | $this->crawlProfile = new CrawlAllUrls(); |
||
| 97 | |||
| 98 | $this->crawlQueue = new CollectionCrawlQueue(); |
||
| 99 | |||
| 100 | $this->crawlObservers = new CrawlObserverCollection(); |
||
| 101 | } |
||
| 102 | |||
| 103 | public function setConcurrency(int $concurrency): self |
||
| 109 | |||
| 110 | public function setMaximumResponseSize(int $maximumResponseSizeInBytes): self |
||
| 111 | { |
||
| 112 | $this->maximumResponseSize = $maximumResponseSizeInBytes; |
||
| 113 | |||
| 114 | return $this; |
||
| 115 | } |
||
| 116 | |||
| 117 | public function getMaximumResponseSize(): ?int |
||
| 118 | { |
||
| 119 | return $this->maximumResponseSize; |
||
| 120 | } |
||
| 121 | |||
| 122 | public function setMaximumCrawlCount(int $maximumCrawlCount): self |
||
| 123 | { |
||
| 124 | $this->maximumCrawlCount = $maximumCrawlCount; |
||
| 125 | |||
| 126 | return $this; |
||
| 127 | } |
||
| 128 | |||
| 129 | public function getMaximumCrawlCount(): ?int |
||
| 130 | { |
||
| 131 | return $this->maximumCrawlCount; |
||
| 132 | } |
||
| 133 | |||
| 134 | public function getCrawlerUrlCount(): int |
||
| 135 | { |
||
| 136 | return $this->crawledUrlCount; |
||
| 137 | } |
||
| 138 | |||
| 139 | public function setMaximumDepth(int $maximumDepth): self |
||
| 140 | { |
||
| 141 | $this->maximumDepth = $maximumDepth; |
||
| 142 | |||
| 143 | return $this; |
||
| 144 | } |
||
| 145 | |||
| 146 | public function getMaximumDepth(): ?int |
||
| 147 | { |
||
| 148 | return $this->maximumDepth; |
||
| 149 | } |
||
| 150 | |||
| 151 | public function ignoreRobots(): self |
||
| 152 | { |
||
| 153 | $this->respectRobots = false; |
||
| 154 | |||
| 155 | return $this; |
||
| 156 | } |
||
| 157 | |||
| 158 | public function respectRobots(): self |
||
| 159 | { |
||
| 160 | $this->respectRobots = true; |
||
| 161 | |||
| 162 | return $this; |
||
| 163 | } |
||
| 164 | |||
| 165 | public function mustRespectRobots(): bool |
||
| 166 | { |
||
| 167 | return $this->respectRobots; |
||
| 168 | } |
||
| 169 | |||
| 170 | public function getRobotsTxt(): RobotsTxt |
||
| 171 | { |
||
| 172 | return $this->robotsTxt; |
||
| 173 | } |
||
| 174 | |||
| 175 | public function setCrawlQueue(CrawlQueue $crawlQueue): self |
||
| 181 | |||
| 182 | public function getCrawlQueue(): CrawlQueue |
||
| 183 | { |
||
| 184 | return $this->crawlQueue; |
||
| 185 | } |
||
| 186 | |||
| 187 | public function executeJavaScript(): self |
||
| 188 | { |
||
| 189 | $this->executeJavaScript = true; |
||
| 190 | |||
| 191 | return $this; |
||
| 192 | } |
||
| 193 | |||
| 194 | public function doNotExecuteJavaScript(): self |
||
| 200 | |||
| 201 | public function mayExecuteJavascript(): bool |
||
| 202 | { |
||
| 203 | return $this->executeJavaScript; |
||
| 204 | } |
||
| 205 | |||
| 206 | /** |
||
| 207 | * @param \Spatie\Crawler\CrawlObserver|\Spatie\Crawler\CrawlObserver[] $crawlObservers |
||
| 208 | * |
||
| 209 | * @return $this |
||
| 210 | */ |
||
| 211 | public function setCrawlObserver($crawlObservers): self |
||
| 212 | { |
||
| 213 | if (! is_array($crawlObservers)) { |
||
| 214 | $crawlObservers = [$crawlObservers]; |
||
| 215 | } |
||
| 219 | |||
| 220 | public function setCrawlObservers(array $crawlObservers): self |
||
| 226 | |||
| 227 | public function addCrawlObserver(CrawlObserver $crawlObserver): self |
||
| 233 | |||
| 234 | public function getCrawlObservers(): CrawlObserverCollection |
||
| 238 | |||
| 239 | public function setCrawlProfile(CrawlProfile $crawlProfile): self |
||
| 245 | |||
| 246 | public function getCrawlProfile(): CrawlProfile |
||
| 250 | |||
| 251 | public function setBrowsershot(Browsershot $browsershot) |
||
| 257 | |||
| 258 | public function getBrowsershot(): Browsershot |
||
| 266 | |||
| 267 | public function getBaseUrl(): UriInterface |
||
| 271 | |||
| 272 | /** |
||
| 273 | * @param \Psr\Http\Message\UriInterface|string $baseUrl |
||
| 274 | */ |
||
| 275 | public function startCrawling($baseUrl) |
||
| 309 | |||
| 310 | public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node |
||
| 334 | |||
| 335 | protected function startCrawlingQueue() |
||
| 350 | |||
| 351 | /** |
||
| 352 | * @deprecated This function will be removed in the next major version |
||
| 353 | */ |
||
| 354 | public function endsWith($haystack, $needle) |
||
| 359 | |||
| 360 | protected function createRobotsTxt(UriInterface $uri): RobotsTxt |
||
| 364 | |||
| 365 | protected function getCrawlRequests(): Generator |
||
| 386 | |||
| 387 | public function addToCrawlQueue(CrawlUrl $crawlUrl): self |
||
| 403 | |||
| 404 | public function maximumCrawlCountReached(): bool |
||
| 414 | } |
||
| 415 | |||
This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.