This project does not seem to handle request data directly; as such, no vulnerable execution paths were found.
include
, or for example
via PHP's auto-loading mechanism.
These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | |||
3 | namespace Zrashwani\NewsScrapper; |
||
4 | |||
5 | use Goutte\Client as GoutteClient; |
||
6 | use Symfony\Component\DomCrawler\Crawler; |
||
7 | use Symfony\Component\BrowserKit\CookieJar; |
||
8 | |||
/**
 * Client to scrape article/news contents from several news sources.
 *
 * Pages are fetched with a Goutte client and the article fields are filled in
 * by one adapter (when one is selected) or by every adapter in $adaptersList.
 *
 * @author Zeid Rashwani <[email protected]>
 */
class Client
{
    /**
     * Browser client used to fetch the pages.
     * @var GoutteClient
     */
    protected $scrapClient;

    /**
     * Adapter names tried, in this order, when no adapter is selected.
     * @var string[]
     */
    protected $adaptersList = ['Microdata', 'HAtom', 'OpenGraph', 'JsonLD', 'Parsely', 'Default'];

    /**
     * Adapter to scrap content; null means "try every adapter in $adaptersList".
     * @var Adapters\AbstractAdapter|null
     */
    protected $adapter;

    /**
     * Creates the Goutte client and selects the adapter used for scrapping.
     *
     * @param string|null    $adapter_name adapter name without the "Adapter" suffix;
     *                                     see setAdapter() for how it is resolved
     * @param CookieJar|null $cookie_jar   cookie jar handed to the Goutte client
     */
    public function __construct($adapter_name = null, CookieJar $cookie_jar = null)
    {
        $this->scrapClient = new GoutteClient([], null, $cookie_jar);
        $this->scrapClient->followRedirects();

        // NOTE(review): both cURL certificate checks are switched off for every
        // request, so HTTPS responses are not authenticated. Behaviour is kept
        // as-is for backward compatibility; consider making this opt-in.
        foreach ([CURLOPT_SSL_VERIFYHOST, CURLOPT_SSL_VERIFYPEER] as $curlOption) {
            $this->scrapClient->getClient()->setDefaultOption('config/curl/'.$curlOption, false);
        }

        $this->setAdapter($adapter_name);
    }

    /**
     * Getting selected adapter
     *
     * @return Adapters\AbstractAdapter|null null when no adapter is selected
     */
    public function getAdapter()
    {
        return $this->adapter;
    }

    /**
     * Setting adapter preferred for scrapping
     *
     * The name is resolved to the class "<this namespace>\Adapters\<name>Adapter".
     * When that class does not exist or is not a concrete adapter, the selection
     * is cleared (null) instead of failing.
     *
     * @param string|null $adapter_name adapter name without the "Adapter" suffix
     * @return $this
     */
    public function setAdapter($adapter_name)
    {
        $this->adapter = $this->createAdapter($adapter_name);

        return $this;
    }

    /**
     * scrap one source of news
     *
     * @param string   $baseUrl      url to scrap list of news from
     * @param string   $linkSelector css selector or XPath expression for news links in page
     * @param int|NULL $limit        limit of news article to scrap,
     *                               if not set it will scrap all matching the selector
     * @return array array of article items scrapped, keyed by the normalized link;
     *               empty when the list page yields no crawler
     */
    public function scrapLinkGroup($baseUrl, $linkSelector, $limit = null)
    {
        $scrap_result = [];

        $crawler = $this->scrapClient->request('GET', $baseUrl);
        if (!$crawler instanceof Crawler) {
            return $scrap_result; // nothing to select links from
        }

        // A default adapter is only used here to normalize the collected links.
        $theAdapter = new Adapters\DefaultAdapter();
        $theAdapter->currentUrl = $baseUrl;

        $method = Selector::isXPath($linkSelector) === false ? 'filter' : 'filterXPath';

        $crawler->$method($linkSelector)->each(
            function (Crawler $link_node) use (&$scrap_result, $theAdapter, $limit) {
                if ($limit !== null && count($scrap_result) >= $limit) {
                    return;
                }

                //remove hash before scrapping
                $link = $theAdapter->normalizeLink($link_node->attr('href'), true);

                // getLinkData() leaves the selected adapter untouched, so the
                // caller's choice (or smart scrapping) applies to every link.
                $scrap_result[$link] = $this->getLinkData($link);
            }
        );

        return $scrap_result;
    }

    /**
     * Scrap information for single url
     *
     * Uses the selected adapter when there is one; otherwise applies every
     * adapter in $adaptersList in order, each one only filling the fields that
     * are still empty. The selected adapter is not modified by this call.
     *
     * @param string $link
     * @return \stdClass article data; only "url" is set when the request
     *                   yields no crawler
     */
    public function getLinkData($link)
    {
        $article_info = new \stdClass();
        $article_info->url = $link;

        $pageCrawler = $this->scrapClient->request('GET', $article_info->url);
        if (!$pageCrawler instanceof Crawler) {
            // extractPageData() requires a Crawler; bail out instead of
            // failing on its type hint.
            return $article_info;
        }

        $selected_adapter = $this->getAdapter();
        if ($selected_adapter !== null) {
            $this->extractPageData($article_info, $pageCrawler, $selected_adapter);

            return $article_info;
        }

        //apply smart scrapping by iterating over all adapters
        foreach ($this->adaptersList as $adapter_name) {
            $candidate = $this->createAdapter($adapter_name);
            if ($candidate !== null) {
                $this->extractPageData($article_info, $pageCrawler, $candidate);
            }
        }

        return $article_info;
    }

    /**
     * Extracting page data from domCrawler according to rules defined by adapter
     *
     * Only fields that are still empty on $article_info are (re)extracted, so
     * values found by an earlier adapter are never overwritten.
     *
     * @param \stdClass $article_info article data; must already carry "url"
     * @param Crawler $pageCrawler
     * @param \Zrashwani\NewsScrapper\Adapters\AbstractAdapter $adapter adapter used for scrapping
     */
    protected function extractPageData(
        $article_info,
        Crawler $pageCrawler,
        Adapters\AbstractAdapter $adapter
    ) {
        $adapter->currentUrl = $article_info->url; //associate link url to adapter

        if (empty($article_info->title)) {
            $article_info->title = $adapter->extractTitle($pageCrawler);
        }
        if (empty($article_info->image)) {
            // NOTE(review): static analysis reports that
            // AbstractAdapter::extractImage() declares fewer parameters than are
            // passed here; the extra argument is kept to preserve behaviour.
            $article_info->image = $adapter->extractImage($pageCrawler, $article_info->url);
        }
        if (empty($article_info->description)) {
            $article_info->description = $adapter->extractDescription($pageCrawler);
        }
        // empty() covers both "not set" and "empty array" without calling
        // count() on a value that may not be countable.
        if (empty($article_info->keywords)) {
            $article_info->keywords = $adapter->extractKeywords($pageCrawler);
        }
        if (empty($article_info->author)) {
            $article_info->author = $adapter->extractAuthor($pageCrawler);
        }
        if (empty($article_info->publishDate)) {
            $article_info->publishDate = $adapter->extractPublishDate($pageCrawler);
        }
        if (empty($article_info->body)) {
            $article_info->body = $adapter->extractBody($pageCrawler);
        }
    }

    /**
     * Builds the adapter registered under the given name.
     *
     * @param string|null $adapter_name adapter name without the "Adapter" suffix
     * @return Adapters\AbstractAdapter|null null when the name does not resolve
     *                                       to a concrete adapter class
     */
    private function createAdapter($adapter_name)
    {
        $baseClass = __NAMESPACE__.'\\Adapters\\AbstractAdapter';
        $adapterClass = __NAMESPACE__.'\\Adapters\\'.$adapter_name.'Adapter';

        // is_subclass_of() is false for the base class itself, which keeps
        // setAdapter('Abstract') from trying to instantiate the abstract adapter.
        if (!class_exists($adapterClass) || !is_subclass_of($adapterClass, $baseClass)) {
            return null;
        }

        return new $adapterClass();
    }
}
||
168 |
Unless you are absolutely sure that the expression can never be null because of other conditions, we strongly recommend adding an additional type check to your code: