<?php

declare(strict_types=1);

use Symfony\Component\DomCrawler\Crawler as DomCrawler;

require_once '../vendor/autoload.php';

// Example: crawl a GitHub profile page and collect title + README excerpt
// from each pinned repository.
$crawler = new \Radowoj\Crawla\Crawler(
    'https://github.com/radowoj',
    new \Symfony\Component\HttpClient\CurlHttpClient(),
);

$dataGathered = [];

// Configure the crawler:
// first - set the CSS selector for links that should be visited
$crawler->setLinkSelector('.pinned-item-list-item-content a.Link')
    // second - define what should be done when a page is visited
    ->setPageVisitedCallback(function (DomCrawler $domCrawler) use (&$dataGathered): void {
        // The callback fires for every visited page, including the base URL,
        // so ensure repo data is gathered only on repository pages.
        // getUri() may return null — coalesce to '' so preg_match() does not
        // receive null under strict_types (TypeError).
        if (!preg_match('/radowoj\/\w+/', $domCrawler->getUri() ?? '')) {
            return;
        }

        $readme = $domCrawler->filter('article.markdown-body');

        $dataGathered[] = [
            'title' => trim($domCrawler->filter('p.f4.my-3')->text()),
            // Not every repo page has a README; guard before reading text().
            'readme' => $readme->count() ? trim($readme->text()) : '',
        ];
    });

// Now crawl, following up to 1 link deep from the entry point.
$crawler->crawl(1);

var_dump($dataGathered);

var_dump($crawler->getVisited()->all());