radowoj /
crawla
<?php

declare(strict_types=1);

use Symfony\Component\DomCrawler\Crawler as DomCrawler;

require_once '../vendor/autoload.php';

// Example: crawl a GitHub profile page and collect title + README text
// from each pinned repository.
$crawler = new \Radowoj\Crawla\Crawler(
    'https://github.com/radowoj',
    new \Symfony\Component\HttpClient\CurlHttpClient(),
);

// Accumulates one ['title' => ..., 'readme' => ...] entry per repo page visited.
$dataGathered = [];

// Configure the crawler:
// 1) CSS selector for the links that should be followed (pinned repo links).
$crawler->setLinkSelector('.pinned-item-list-item-content a.Link')
    // 2) Callback invoked for every visited page's DOM.
    ->setPageVisitedCallback(static function (DomCrawler $domCrawler) use (&$dataGathered): void {
        // The callback fires for every visited page, including the base URL,
        // so only gather data on actual repository pages (".../radowoj/<repo>").
        if (!preg_match('/radowoj\/\w+/', $domCrawler->getUri())) {
            return;
        }

        $readme = $domCrawler->filter('article.markdown-body');

        $dataGathered[] = [
            'title' => trim($domCrawler->filter('p.f4.my-3')->text()),
            // Not every repo has a README; fall back to an empty string.
            'readme' => $readme->count() ? trim($readme->text()) : '',
        ];
    });

// Crawl, following links up to 1 level deep from the entry point.
$crawler->crawl(1);

var_dump($dataGathered);

var_dump($crawler->getVisited()->all());
| 41 |