| 1 |  |  | <?php | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | namespace Spatie\Crawler\Handlers; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | use Spatie\Crawler\Crawler; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | use Spatie\Crawler\CrawlUrl; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | use Spatie\Crawler\LinkAdder; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | use Spatie\Crawler\CrawlerRobots; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  | use Psr\Http\Message\UriInterface; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  | use Spatie\Crawler\CrawlSubdomains; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  | use Psr\Http\Message\StreamInterface; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  | use Psr\Http\Message\ResponseInterface; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  | use function GuzzleHttp\Psr7\stream_for; | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                /**
                 * Invokable handler for a crawl request that completed successfully.
                 *
                 * It reports the response to the crawl observers and hands the response
                 * body to the LinkAdder so that links found in it can be queued.
                 * NOTE(review): presumably registered as the "fulfilled" callback of the
                 * crawler's request pool - confirm against Crawler.
                 */
                class CrawlRequestFulfilled
                {
                    /** @var \Spatie\Crawler\Crawler */
                    protected $crawler;

                    /** @var \Spatie\Crawler\LinkAdder */
                    protected $linkAdder;

                    /**
                     * @param \Spatie\Crawler\Crawler $crawler Crawler whose queue, profile, observers
                     *                                         and settings this handler works with.
                     */
                    public function __construct(Crawler $crawler)
                    {
                        $this->crawler = $crawler;

                        $this->linkAdder = new LinkAdder($this->crawler);
                    }

                    /**
                     * Handle one successfully fetched response.
                     *
                     * The "index" and "follow" robots decisions are treated independently:
                     * a page that may not be indexed is not reported to the observers, but
                     * its links are still queued when following is allowed.
                     *
                     * @param \Psr\Http\Message\ResponseInterface $response Response received for the URL.
                     * @param mixed                               $index    Identifier handed to the crawl
                     *                                                      queue's getUrlById().
                     *
                     * @return void
                     */
                    public function __invoke(ResponseInterface $response, $index)
                    {
                        $robots = new CrawlerRobots($response, $this->crawler->mustRespectRobots());

                        // Assumes mayIndex()/mayFollow() are side-effect-free queries - TODO confirm.
                        $mayIndex = $robots->mayIndex();
                        $mayFollow = $robots->mayFollow();

                        // Nothing to report and nothing to follow: stop before the queue lookup
                        // and the (potentially expensive) JavaScript rendering.
                        if (! $mayIndex && ! $mayFollow) {
                            return;
                        }

                        $crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index);

                        if ($this->crawler->mayExecuteJavaScript()) {
                            // Swap the raw body for the markup produced after running JavaScript,
                            // so observers and link extraction both see the rendered page.
                            $html = $this->getBodyAfterExecutingJavaScript($crawlUrl->url);

                            $response = $response->withBody(stream_for($html));
                        }

                        // A page that may not be indexed is hidden from the observers only;
                        // it must not prevent its links from being followed below.
                        if ($mayIndex) {
                            $this->handleCrawled($response, $crawlUrl);
                        }

                        if (! $mayFollow) {
                            return;
                        }

                        // Unless the profile is CrawlSubdomains, links are only collected from
                        // pages served by exactly the same host as the base URL.
                        $crawlsSubdomains = $this->crawler->getCrawlProfile() instanceof CrawlSubdomains;

                        if (! $crawlsSubdomains
                            && $crawlUrl->url->getHost() !== $this->crawler->getBaseUrl()->getHost()
                        ) {
                            return;
                        }

                        $body = $this->convertBodyToString(
                            $response->getBody(),
                            $this->crawler->getMaximumResponseSize()
                        );

                        $this->linkAdder->addFromHtml($body, $crawlUrl->url);
                    }

                    /**
                     * Notify the crawl observers that the given URL was crawled.
                     *
                     * @param \Psr\Http\Message\ResponseInterface $response Response to report.
                     * @param \Spatie\Crawler\CrawlUrl            $crawlUrl Queue entry the response belongs to.
                     *
                     * @return void
                     */
                    protected function handleCrawled(ResponseInterface $response, CrawlUrl $crawlUrl)
                    {
                        $this->crawler->getCrawlObservers()->crawled($crawlUrl, $response);
                    }

                    /**
                     * Read at most $readMaximumBytes bytes from the start of a body stream.
                     *
                     * The stream is rewound only when it is seekable, because rewind() throws
                     * on non-seekable streams. Reading happens in a loop because a single
                     * read() call may legitimately return fewer bytes than requested.
                     *
                     * @param \Psr\Http\Message\StreamInterface $bodyStream       Stream to read from.
                     * @param int                               $readMaximumBytes Upper bound, in bytes, of
                     *                                                            the returned string.
                     *
                     * @return string The bytes read; empty when the limit is not positive or the
                     *                stream yields no data.
                     */
                    protected function convertBodyToString(StreamInterface $bodyStream, $readMaximumBytes = 1024 * 1024 * 2): string
                    {
                        if ($bodyStream->isSeekable()) {
                            $bodyStream->rewind();
                        }

                        $body = '';
                        $remainingBytes = (int) $readMaximumBytes;

                        while ($remainingBytes > 0 && ! $bodyStream->eof()) {
                            $chunk = (string) $bodyStream->read($remainingBytes);

                            // An empty read without EOF would otherwise spin forever.
                            if ($chunk === '') {
                                break;
                            }

                            $body .= $chunk;

                            // strlen() is intentional: the limit is expressed in bytes, not characters.
                            $remainingBytes -= strlen($chunk);
                        }

                        return $body;
                    }

                    /**
                     * Fetch the given URL through the crawler's Browsershot instance and
                     * return the resulting body markup with HTML entities decoded.
                     *
                     * @param \Psr\Http\Message\UriInterface $url URL to render.
                     *
                     * @return string Entity-decoded body HTML as returned by Browsershot's bodyHtml().
                     */
                    protected function getBodyAfterExecutingJavaScript(UriInterface $url): string
                    {
                        $browsershot = $this->crawler->getBrowsershot();

                        $renderedHtml = $browsershot->setUrl((string) $url)->bodyHtml();

                        return html_entity_decode($renderedHtml);
                    }
                }
            
                                                        
            
                                    
            
            
                | 86 |  |  |  |