CurlFetchWebdata   A
last analyzed

Complexity

Total Complexity 33

Size/Duplication

Total Lines 341
Duplicated Lines 0 %

Test Coverage

Coverage 95%

Importance

Changes 0
Metric Value
eloc 97
dl 0
loc 341
ccs 76
cts 80
cp 0.95
rs 9.76
c 0
b 0
f 0
wmc 33

10 Methods

Rating   Name   Duplication   Size   Complexity  
A get_url_data() 0 17 4
A __construct() 0 5 1
A _setOptions() 0 22 3
A _headerCallback() 0 13 2
A result_raw() 0 10 2
A result() 0 11 2
B _curlRequest() 0 52 10
A _getRedirectURL() 0 14 3
A _redirect() 0 6 1
A _buildPostData() 0 17 5
1
<?php
2
3
/**
4
 * Provides a cURL interface for fetching files and submitting requests to sites
5
 *
6
 * @package   ElkArte Forum
7
 * @copyright ElkArte Forum contributors
8
 * @license   BSD http://opensource.org/licenses/BSD-3-Clause (see accompanying LICENSE.txt file)
9
 *
10
 * @version 2.0 dev
11
 *
12
 */
13
14
namespace ElkArte\Http;
15
16
/**
17
 * Simple cURL class to fetch a web page
18
 * Properly redirects even with safe mode and basedir restrictions
19
 * Can provide simple post options to a page
20
 *
21
 * Load class
22
 * Initiate as
23
 *  - $fetch_data = new CurlFetchWebdata();
24
 *  - optionally pass an array of cURL options and redirect count
25
 *  - CurlFetchWebdata(cURL options array, Max redirects);
26
 *  - $fetch_data = new CurlFetchWebdata(array(CURLOPT_SSL_VERIFYPEER => 1), 5);
27
 *
28
 * Make the call
29
 *  - $fetch_data->get_url_data('http://www.adomain.org'); // fetch a page
30
 *  - $fetch_data->get_url_data('http://www.adomain.org', array('user' => 'name', 'password' => 'password')); // post to a page
31
 *  - $fetch_data->get_url_data('http://www.adomain.org', 'parameter1&parameter2&parameter3'); // post a pre-built string to a page
32
 *
33
 * Get the data
34
 *  - $fetch_data->result('body'); // just the page content
35
 *  - $fetch_data->result(); // an array of results, body, header, http result codes
36
 *  - $fetch_data->result_raw(); // show all results of all calls (in the event of a redirect)
37
 *  - $fetch_data->result_raw(x); // show all results of call x
38
 */
39
class CurlFetchWebdata
{
	/**
	 * Set the default items for this class
	 *
	 * Any of these can be replaced by passing the same CURLOPT_* key to the constructor.
	 *
	 * @var array
	 */
	private $default_options = [
		CURLOPT_RETURNTRANSFER => true, // Get returned value as a string (don't output it)
		CURLOPT_HEADER => true, // We need the headers to do our own redirect
		CURLOPT_FOLLOWLOCATION => false, // Don't follow, we will do it ourselves so safe mode and open_basedir will dig it
		CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14931', // set a normal looking user agent
		CURLOPT_CONNECTTIMEOUT => 10, // Don't wait forever on a connection
		CURLOPT_TIMEOUT => 10, // A page should load in this amount of time
		CURLOPT_MAXREDIRS => 3, // stop after this many redirects
		CURLOPT_ENCODING => 'gzip,deflate', // accept gzip and decode it
		// NOTE(review): certificate and host verification are disabled by default, pass
		// CURLOPT_SSL_VERIFYPEER / CURLOPT_SSL_VERIFYHOST in the constructor options to enable them
		CURLOPT_SSL_VERIFYPEER => false, // stop cURL from verifying the peer's certificate
		CURLOPT_SSL_VERIFYHOST => 0, // stop cURL from verifying the peer's host
		CURLOPT_POST => false, // no post data unless its passed
		CURLOPT_HTTPHEADER => ['Accept-Encoding: gzip,compress,identity'], // default request headers, replaced as a whole by a user supplied list
	];

	/** @var int Holds the passed or default value for redirects */
	private $_max_redirect;

	/** @var int Holds the current redirect count for the request */
	private $_current_redirect = 0;

	/** @var array Holds the passed user options array */
	private $_user_options;

	/** @var string Holds any data that will be posted to a form */
	private $_post_data = '';

	/** @var array[] Holds one entry per request made (url, code, error, size, headers, body) */
	private $_response = [];

	/** @var array Holds response headers of the request currently being processed */
	private $_headers = [];

	/** @var array Holds the options for this request */
	private $_options = [];

	/**
	 * Start the cURL object
	 *
	 * - Allow for user override values
	 *
	 * @param array $options cURL options as an array
	 * @param int $max_redirect Maximum number of redirects
	 */
	public function __construct($options = [], $max_redirect = 3)
	{
		// Initialize class variables
		$this->_max_redirect = (int) $max_redirect;
		$this->_user_options = $options;
	}

	/**
	 * Main calling function
	 *
	 * What it does:
	 *
	 * - Will request the page data from a given $url
	 * - Optionally will post data to the page form if post data is supplied
	 * - Passed arrays will be converted to a post string joined with &'s
	 * - Calls _setOptions to set the curl opts array values based on the defaults and user input
	 * - Clears any post data left over from an earlier call, so reusing the object
	 *   for a plain fetch never re-sends an old payload
	 *
	 * @param string $url the site we are going to fetch
	 * @param array|string $post_data data to send in the curl request as post data
	 *
	 * @return CurlFetchWebdata
	 */
	public function get_url_data($url, $post_data = [])
	{
		// Start clean, a previous call may have left a payload behind
		$this->_post_data = '';

		// POSTing some data perhaps?
		if (!empty($post_data))
		{
			$this->_post_data = is_array($post_data) ? $this->_buildPostData($post_data) : trim($post_data);
		}

		// Set the options and get it
		$this->_setOptions();
		$this->_curlRequest(str_replace(' ', '%20', $url));

		return $this;
	}

	/**
	 * Takes supplied POST data and url encodes it
	 *
	 * What it does:
	 *
	 * - Forms the data (for post) in to a string var=xyz&var2=abc&var3=123
	 * - Drops vars with @ since we don't support sending files (uploading)
	 * - Scalar values are cast to strings, anything else (null, arrays, objects) is sent empty
	 * - Names are used as supplied, only the values are url encoded
	 *
	 * @param array $post_data
	 *
	 * @return array|string the encoded string, or the input untouched if it was not an array
	 */
	private function _buildPostData($post_data)
	{
		if (!is_array($post_data))
		{
			return $post_data;
		}

		$post_vars = [];

		// Build the post data, drop ones with leading @'s since those can be used to send files,
		// we don't support that.
		foreach ($post_data as $name => $value)
		{
			// Only scalars have a sensible string form, this also keeps the [0] check below on a string
			$value = is_scalar($value) ? (string) $value : '';

			if ($value !== '' && $value[0] === '@')
			{
				$value = '';
			}

			$post_vars[] = $name . '=' . urlencode($value);
		}

		return implode('&', $post_vars);
	}

	/**
	 * Sets the final cURL options for the current call
	 *
	 * What it does:
	 *
	 * - Overwrites our default values with user supplied ones or appends new user ones to what we have
	 * - Sets the callback function now that $this exists
	 * - Applies the POST options last so they can not be overridden
	 *
	 * @uses _headerCallback()
	 */
	private function _setOptions()
	{
		// Callback to parse the returned headers, if any
		$this->default_options[CURLOPT_HEADERFUNCTION] = fn($cr, $header) => $this->_headerCallback($cr, $header);

		// Any user options to account for, array_replace keeps the integer CURLOPT_* keys intact
		$this->_options = is_array($this->_user_options)
			? array_replace($this->default_options, $this->_user_options)
			: $this->default_options;

		// POST data options, here we don't allow any override
		if (!empty($this->_post_data))
		{
			$this->_options[CURLOPT_POST] = 1;
			$this->_options[CURLOPT_POSTFIELDS] = $this->_post_data;
		}
	}

	/**
	 * Callback function to parse returned headers
	 *
	 * What it does:
	 *
	 * - Splits each "name: value" line on the first colon, the space after the colon is optional
	 * - lowercase the name to make it consistent
	 * - Ignores lines that are not header fields, such as the status line or the blank separator
	 *
	 * @param object $cr Not used but passed by the cURL agent
	 * @param string $header The header line received
	 *
	 * @return int
	 */
	private function _headerCallback($cr, $header)
	{
		$parts = explode(':', $header, 2);

		// Set proper headers only
		if (isset($parts[1]))
		{
			$name = strtolower(trim($parts[0]));

			// A field name never contains a space, this filters out anything like a status line
			if ($name !== '' && strpos($name, ' ') === false)
			{
				$this->_headers[$name] = trim($parts[1]);
			}
		}

		// Return the length of what was *passed* unless you want a Failed writing header error ;)
		return strlen($header);
	}

	/**
	 * Makes the actual cURL call
	 *
	 * What it does
	 * - Store responses (url, code, error, headers, body) in the response array
	 * - Detects 301, 302, 307 codes and will redirect to the given response header location
	 * - Collects headers per request, so one response never inherits headers from a previous one
	 *
	 * @param string $url site to fetch
	 * @param bool $redirect flag to indicate if this was a redirect request or not
	 *
	 * @return bool false if no url was supplied, true once the request has been made
	 */
	private function _curlRequest($url, $redirect = false)
	{
		// We do have a url I hope
		if (trim($url) === '')
		{
			return false;
		}

		$this->_options[CURLOPT_URL] = $url;

		// If we have not already been redirected, set it up so we can
		if (!$redirect)
		{
			$this->_current_redirect = 1;
			$this->_response = [];
		}

		// The header callback fills this in during curl_exec, forget what the last request left
		$this->_headers = [];

		// Initialize the curl object and make the call
		$cr = curl_init();
		curl_setopt_array($cr, $this->_options);
		curl_exec($cr);

		// Get what was returned
		$curl_info = curl_getinfo($cr);
		$curl_content = (string) curl_multi_getcontent($cr);
		$curl_error = curl_error($cr);

		// Close this request
		curl_close($cr);

		$url = $curl_info['url']; // Last effective URL
		$http_code = (int) $curl_info['http_code']; // Last HTTP code
		$failed = $curl_error !== '';

		// Store this 'loops' data, someone may want all of these :O
		$this->_response[] = [
			'url' => $url,
			'code' => $http_code,
			'error' => $failed ? $curl_error : false,
			'size' => empty($curl_info['download_content_length']) ? 0 : $curl_info['download_content_length'],
			'headers' => empty($this->_headers) ? false : $this->_headers,
			// The content starts with the headers (CURLOPT_HEADER), skip over them
			'body' => $failed ? false : substr($curl_content, $curl_info['header_size']),
		];

		// If this a redirect with a location header and we have not given up, then we play it again Sam
		if (!empty($this->_headers['location'])
			&& $this->_current_redirect <= $this->_max_redirect
			&& in_array($http_code, [301, 302, 307], true))
		{
			$this->_current_redirect++;
			$header_location = $this->_getRedirectURL($url, $this->_headers['location']);
			$this->_redirect($header_location, $url);
		}

		return true;
	}

	/**
	 * Used if being redirected to ensure we have a fully qualified address
	 *
	 * What it does:
	 *
	 * - Returns the new url location for the redirect
	 * - Missing scheme / host (and with them the port) are taken from the url we just visited
	 * - A relative path (no leading /) is resolved against the directory of the last url
	 * - Dot segments (./ and ../) are passed through as is, user info and fragments are dropped
	 *
	 * @param string $last_url URL where we went to
	 * @param string $new_url URL where we were redirected to
	 *
	 * @return string
	 */
	private function _getRedirectURL($last_url = '', $new_url = '')
	{
		// Get the elements for these urls, parse_url returns false on seriously malformed input
		$last = parse_url($last_url) ?: [];
		$new = parse_url($new_url) ?: [];

		// Redirect headers are often incomplete / relative so we need to make sure they are fully qualified
		$same_host = !isset($new['host']);
		$scheme = $new['scheme'] ?? $last['scheme'] ?? 'http';
		$host = $new['host'] ?? $last['host'] ?? '';
		$query = $new['query'] ?? '';

		// The port belongs to whichever url supplied the host
		$port = $same_host ? ($last['port'] ?? null) : ($new['port'] ?? null);

		if (isset($new['path']))
		{
			$path = $new['path'];

			// Relative to the document we were just at, so prefix that document's directory
			if ($same_host && $path !== '' && $path[0] !== '/')
			{
				$base = $last['path'] ?? '/';
				$slash = strrpos($base, '/');
				$path = ($slash === false ? '/' : substr($base, 0, $slash + 1)) . $path;
			}
		}
		else
		{
			// Only a query (or nothing) was given, stay on the same document when on the same host
			$path = $same_host ? ($last['path'] ?? '') : '';
		}

		// Build the new URL that was in the http header
		return $scheme . '://' . $host . ($port === null ? '' : ':' . $port) . $path . ($query === '' ? '' : '?' . $query);
	}

	/**
	 * Called to initiate a redirect from a 301, 302 or 307 header
	 *
	 * What it does
	 * - Resets the cURL options for the loop, sets the referrer flag
	 *
	 * @param string $target_url The URL of the target
	 * @param string $referer_url The URL of the link that referred us to the new target
	 */
	private function _redirect($target_url, $referer_url)
	{
		// No I last saw that over there ... really, 301, 302, 307
		$this->_setOptions();
		$this->_options[CURLOPT_REFERER] = $referer_url;
		$this->_curlRequest($target_url, true);
	}

	/**
	 * Used to return the results to the calling program
	 *
	 * What it does:
	 *
	 * - Called as ->result() will return the full final array
	 * - Called as ->result('body') to just return the page source of the result
	 * - An unknown area name returns the full final array
	 * - Returns null when no request has been made yet
	 *
	 * @param string $area used to return an area such as body, headers, error
	 *
	 * @return array|string|int|bool|null
	 */
	public function result($area = '')
	{
		// Nothing fetched, nothing to show
		if (empty($this->_response))
		{
			return null;
		}

		$final = $this->_response[count($this->_response) - 1];

		// Just return a specified area or the entire result?
		if (trim($area) === '')
		{
			return $final;
		}

		return $final[$area] ?? $final;
	}

	/**
	 * Will return all results from all loops (redirects)
	 *
	 * What it does:
	 *
	 * - Can be called as ->result_raw(x) where x is a specific loop results.
	 * - Call as ->result_raw() for everything.
	 * - x is limited to the range of available results
	 * - Returns null for a specific loop when no request has been made yet
	 *
	 * @param int|string $response_number
	 *
	 * @return array|null
	 */
	public function result_raw($response_number = '')
	{
		if (!is_numeric($response_number))
		{
			return $this->_response;
		}

		if (empty($this->_response))
		{
			return null;
		}

		// Keep the index inside what we actually have
		$index = max(0, min((int) $response_number, count($this->_response) - 1));

		return $this->_response[$index];
	}
}
382