<?php

// Namespace and imports as implied by the fully qualified types used below.
namespace Spatie\Crawler;

use Generator;
use GuzzleHttp\Client;
use GuzzleHttp\RequestOptions;
use Psr\Http\Message\UriInterface;
use Spatie\Browsershot\Browsershot;
use Spatie\Robots\RobotsTxt;
use Tree\Node\Node;

class Crawler
{
    use CrawlerProperties;

    /** @var \GuzzleHttp\Client */
    protected $client;

    /** @var \Psr\Http\Message\UriInterface */
    protected $baseUrl;

    /** @var \Spatie\Crawler\ObserverCollection */
    protected $crawlObservers;

    /** @var \Spatie\Crawler\CrawlProfile */
    protected $crawlProfile;

    /** @var int */
    protected $concurrency;

    /** @var \Spatie\Crawler\CrawlQueue\CrawlQueue */
    protected $crawlQueue;

    /** @var int */
    protected $crawledUrlCount = 0;

    /** @var int|null */
    protected $maximumCrawlCount = null;

    /** @var int */
    protected $maximumResponseSize = 1024 * 1024 * 2;

    /** @var int|null */
    protected $maximumDepth = null;

    /** @var bool */
    protected $respectRobots = true;

    /** @var \Tree\Node\Node */
    protected $depthTree;

    /** @var bool */
    protected $executeJavaScript = false;

    /** @var Browsershot */
    protected $browsershot = null;

    /** @var \Spatie\Robots\RobotsTxt */
    protected $robotsTxt = null;

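    // Defaults for the Guzzle client: keep cookies, cap connection and
    // request time at 10 seconds each, and do not follow redirects.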
    protected static $defaultClientOptions = [
        RequestOptions::COOKIES => true,
        RequestOptions::CONNECT_TIMEOUT => 10,
        RequestOptions::TIMEOUT => 10,
        RequestOptions::ALLOW_REDIRECTS => false,
    ];

    /**
     * @param array $clientOptions
     *
     * @return static
     */
    public static function create(array $clientOptions = [])
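    {
        // Sketch: the original body is not included in this listing. Assumed
        // behaviour: fall back to the default client options declared above
        // when none are given, then hand the client to the constructor.
        $client = new Client($clientOptions ?: static::$defaultClientOptions);

        return new static($client);
    }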

    /**
     * @param \GuzzleHttp\Client $client
     * @param int $concurrency
     */
    public function __construct(Client $client, int $concurrency = 10)

    /**
     * Start crawling from the given base URL.
     *
     * @param \Psr\Http\Message\UriInterface|string $baseUrl
     */
    public function startCrawling($baseUrl)
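    // Not shown in this listing: a typical flow for this entry point would be
    // to normalise $baseUrl to a UriInterface, seed $depthTree and (when
    // $respectRobots is enabled) $robotsTxt, queue the base URL, drain the
    // queue via startCrawlingQueue(), and notify the crawl observers once
    // crawling has finished.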

    /**
     * Add the given URL to the depth tree underneath its parent URL.
     */
    public function addToDepthTree(UriInterface $url, UriInterface $parentUrl, Node $node = null): ?Node

    /**
     * Crawl the URLs that are waiting in the crawl queue.
     */
    protected function startCrawlingQueue()
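    // Not shown in this listing: given the $concurrency property and the
    // Guzzle client above, the queue can be expected to be drained with
    // concurrent HTTP requests built from getCrawlRequests() below.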

    /**
     * @deprecated This function will be removed in the next major version
     */
    public function endsWith($haystack, $needle)

    /**
     * Build the robots.txt rules for the given URI.
     */
    protected function createRobotsTxt(UriInterface $uri): RobotsTxt
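    // Not shown in this listing: presumably the robots.txt at the root of the
    // host in $uri is fetched and parsed into the spatie/robots RobotsTxt
    // value object referenced in the property docblocks above.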

    /**
     * Yield a request for every URL that is still pending in the crawl queue.
     */
    protected function getCrawlRequests(): Generator

    /**
     * Add the given URL to the crawl queue.
     */
    public function addToCrawlQueue(CrawlUrl $crawlUrl): self
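    // Not shown in this listing: before queueing, the URL can be expected to
    // be skipped when maximumCrawlCountReached() is true, when the active
    // crawl profile rejects it, or when the crawl queue already contains it.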

    /**
     * Determine whether the configured maximum crawl count has been reached.
     */
    public function maximumCrawlCountReached(): bool
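    {
        // Sketch: the original body is not included in this listing. Assumed
        // behaviour: a null $maximumCrawlCount means there is no limit.
        if (is_null($this->maximumCrawlCount)) {
            return false;
        }

        return $this->crawledUrlCount >= $this->maximumCrawlCount;
    }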
}
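
// Example usage (illustrative only, not part of this file). The chained
// setter is assumed to be provided by the CrawlerProperties trait used above;
// the timeout value and URL are placeholders:
//
//     Crawler::create([RequestOptions::TIMEOUT => 30])
//         ->setMaximumCrawlCount(100)
//         ->startCrawling('https://example.com');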