<?php

// Namespace and imports are not shown in this excerpt; the ones below are
// inferred from the fully-qualified names used in the docblocks.
namespace Zrashwani\NewsScrapper;

use GuzzleHttp\Cookie\CookieJar;
use Symfony\Component\DomCrawler\Crawler;

class Client
{
    protected $scrapClient;

    protected $adaptersList = ['Microdata', 'HAtom', 'OpenGraph', 'JsonLD', 'Parsely', 'Default'];

    /**
     * Adapter used to scrape content
     * @var Adapters\AbstractAdapter
     */
    protected $adapter;

    /**
     * Constructor
     * @param string|null    $adapter_name
     * @param CookieJar|null $cookie_jar
     */
    public function __construct($adapter_name = null, CookieJar $cookie_jar = null)
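    {
        // Body elided in this excerpt. A minimal sketch of what it plausibly
        // does, judging from the properties above (an assumption, not the
        // library's actual code):
        //
        //     $this->scrapClient = new \Goutte\Client();
        //     $this->setAdapter($adapter_name);
    }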

    /**
     * Get the currently selected adapter
     * @return Adapters\AbstractAdapter
     */
    public function getAdapter()
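    {
        // Body elided in this excerpt; presumably a plain getter:
        //
        //     return $this->adapter;
    }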

    /**
     * Set the adapter preferred for scraping
     * @param string $adapter_name
     * @throws \Exception
     */
    public function setAdapter($adapter_name)
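    {
        // Body elided in this excerpt. A sketch of the behavior implied by the
        // docblock, $adaptersList, and the setAdapter('') reset call further
        // down (an assumption, not the library's actual code):
        //
        //     if (empty($adapter_name)) {
        //         $this->adapter = null; // reset so the adapter is re-detected
        //         return;
        //     }
        //     $adapter_class = __NAMESPACE__ . '\Adapters\\' . $adapter_name . 'Adapter';
        //     if (!class_exists($adapter_class)) {
        //         throw new \Exception('Invalid adapter name');
        //     }
        //     $this->adapter = new $adapter_class();
    }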

    /**
     * Scrape one source of news
     * @param string   $baseUrl      url to scrape the list of news from
     * @param string   $linkSelector css selector for the news links in the page
     * @param int|null $limit        maximum number of news articles to scrape;
     *                               if not set, all links matching the selector are scraped
     * @return array array of scraped article items, keyed by link
     */
    public function scrapLinkGroup($baseUrl, $linkSelector, $limit = null)
    {
        $crawler = $this->scrapClient->request('GET', $baseUrl);

        $scrap_result = array();
        $theAdapter = new Adapters\DefaultAdapter();
        $theAdapter->currentUrl = $baseUrl;

        // the link selector may be either a css selector or an xpath expression
        $isXpath = Selector::isXPath($linkSelector);
        $method = ($isXpath === false) ? 'filter' : 'filterXPath';

        $crawler->$method($linkSelector)
            ->each(
                function (Crawler $link_node) use (&$scrap_result, $theAdapter, &$limit) {
                    if (!is_null($limit) && count($scrap_result) >= $limit) {
                        return;
                    }
                    $link = $theAdapter
                        ->normalizeLink($link_node->attr('href'), true); // remove the hash before scraping

                    $article_info = $this->getLinkData($link);
                    $this->setAdapter(''); // reset to the default adapter after scraping one link
                    $scrap_result[$link] = $article_info;
                }
            );

        return $scrap_result;
    }
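
    /*
     * Example usage (an illustrative sketch; the URL, selector, and limit
     * below are hypothetical, not taken from the library):
     *
     *     $client = new Client();
     *     $articles = $client->scrapLinkGroup('https://example.com/news', '.headline a', 5);
     *     foreach ($articles as $url => $info) {
     *         echo $url, PHP_EOL; // $info is the \stdClass returned by getLinkData()
     *     }
     */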

    /**
     * Scrape information for a single url
     * @param string $link
     * @return \stdClass
     */
    public function getLinkData($link)
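    {
        // Body elided in this excerpt. A rough sketch of what the surrounding
        // code implies: fetch the page, pick an adapter, then delegate to
        // extractPageData() (an assumption, not the library's actual code):
        //
        //     $article_info = new \stdClass();
        //     $pageCrawler = $this->scrapClient->request('GET', $link);
        //     $this->extractPageData($article_info, $pageCrawler, $this->getAdapter());
        //     return $article_info;
    }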

    /**
     * Extract page data from the domCrawler according to the rules defined by the adapter
     * @param \stdClass $article_info
     * @param Crawler $pageCrawler
     * @param \Zrashwani\NewsScrapper\Adapters\AbstractAdapter $adapter adapter used for scraping
     */
    protected function extractPageData( // parameters reconstructed from the docblock; body elided in this excerpt
        \stdClass $article_info,
        Crawler $pageCrawler,
        Adapters\AbstractAdapter $adapter
    ) {
        // ...
    }
}

Unless you are absolutely sure that the expression can never be null because of other conditions, we strongly recommend adding an additional type check to your code:
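The flagged expression is not identified in this excerpt; in the code shown above, the most likely candidate is $link_node->attr('href'), since Symfony's Crawler::attr() returns null when the node carries no such attribute. A minimal guard inside the each() closure could look like this (a sketch, not the library's actual fix):

    // hypothetical guard: skip anchors that have no href attribute
    $href = $link_node->attr('href');
    if (is_null($href)) {
        return; // nothing to normalize or scrape for this node
    }
    $link = $theAdapter->normalizeLink($href, true); // remove the hash before scraping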