This project does not seem to handle request data directly; as such, no vulnerable execution paths were found.
include
, or for example
via PHP's auto-loading mechanism.
These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | require_once(dirname(__FILE__) . '/fwolflib.php'); |
||
3 | require_once(FWOLFLIB . 'class/curl.php'); |
||
4 | require_once(FWOLFLIB . 'func/download.php'); |
||
5 | require_once(FWOLFLIB . 'func/env.php'); |
||
6 | require_once(FWOLFLIB . 'func/request.php'); |
||
7 | require_once(FWOLFLIB . 'func/url.php'); |
||
8 | |||
9 | |||
10 | /** |
||
11 | * Convert css, js, image in a html file, to save it in ONE file like mht. |
||
12 | * |
||
13 | * @package fwolflib |
||
14 | * @copyright Copyright 2007-2012, Fwolf |
||
15 | * @author Fwolf <[email protected]> |
||
16 | * @since 2007-04-06 |
||
17 | */ |
||
18 | class ToDataUri extends Curl { |
||
/**
 * Cache of resources already retrieved.
 * Format: url => data:URI string built from the retrieved content
 * @var array
 */
protected $mCache = array();

/**
 * Charset of the original web page.
 * Shown in the info block.
 * @var string
 */
protected $mCharset = '';

/**
 * Whether the script is running in cli mode.
 * When true, progress messages are echoed directly.
 * @var boolean
 */
protected $mCliMode = false;

/**
 * Urls that could not be retrieved.
 * Only used for debug or output purposes.
 * @var array
 */
protected $mGetFailed = array();

/**
 * Urls that were retrieved successfully.
 * @var array
 */
protected $mGetOk = array();

/**
 * Html code retrieved from the target.
 * Changes are applied here as well, so this can be output directly.
 * @var string
 */
public $mHtml = '';

/**
 * Information about the process, displayed in the footer. (obsolete?)
 * @var string
 * @see $mMsg
 */
public $mInfo = '';

/**
 * Simple response message.
 * Displayed below the form.
 * @var string
 * @see $mInfo
 */
public $mMsg = '';

/**
 * Whether to retrieve the html data automatically.
 * The html is retrieved by url by default; if set to false, $this->mHtml must be set manually.
 * @var boolean
 * @see $mHtml
 */
public $mRetrieveHtml = true;

/**
 * Original url.
 * The web page which contains the css, js and images.
 * @var string
 */
public $mUrl = '';

/**
 * Base url of the target web page.
 * eg: for http://tld.com/dir/index.html, the base url is http://tld.com/dir/
 * @var string
 */
protected $mUrlBase = '';

/**
 * Scheme of the base url, http or https.
 * @var string
 */
protected $sUrlPlan = '';
||
102 | |||
103 | |||
/**
 * Constructor.
 *
 * Runs the parent constructor, registers the target url through SetUrl()
 * (which only accepts it when IsSafe() agrees), switches SSL verification
 * off and records whether the script runs from the command line.
 *
 * @param string $url  Url of the web page to convert, may be empty.
 */
public function __construct($url = '')
{
    parent::__construct();

    $this->SetUrl($url);
    // NOTE(review): certificate verification is disabled for every request.
    $this->SetoptSslverify(false);

    // In cli mode several methods echo progress messages directly
    if (IsCli()) {
        $this->mCliMode = true;
    }
} // end of func __construct
||
118 | |||
119 | |||
/**
 * Add process information to the dom, displayed at the bottom of the page.
 *
 * Builds a <ul> holding the original url, the original charset, a pointer
 * to this tool, the generation time and a collapsible list of every
 * resource that was (or could not be) retrieved, then appends it to <body>
 * inside a <div id="fwolf_save_file_all_in_one_info">. A <body> element is
 * created when the document has none.
 *
 * In cli mode a one line summary is echoed as well.
 *
 * @param DOMDocument $dom  Document being converted, modified in place.
 */
protected function AddInfo (&$dom) {
    $dom_info_ul = $dom->createElement('ul');
    $dom_info_ul->setAttribute('style', 'text-align: left');

    // Original url
    $dom_info_ul->appendChild($this->AddInfoLink($dom, "Original url: ", $this->mUrl));

    // Original charset
    $li = $dom->createElement('li', htmlspecialchars("Original charset: {$this->mCharset}"));
    $dom_info_ul->appendChild($li);

    // Url of this script
    if ($this->mCliMode) {
        $li = $dom->createElement('li', "Generate using Fwolf's 'Save html all in one file' tools(cli mode php script).");
    } else {
        $a = $dom->createElement('a', "Fwolf's 'Save html all in one file' tools");
        $a->setAttribute('href', GetSelfUrl(false));
        $li = $dom->createElement('li', "Generate using: ");
        $li->appendChild($a);
    }
    $dom_info_ul->appendChild($li);

    // Generate time
    $li = $dom->createElement('li', htmlspecialchars("Generate time: " . date('Y-m-d G:i:s')));
    $dom_info_ul->appendChild($li);

    // Resources summary, with a toggle for the detail list below
    $i_getok = count($this->mGetOk);
    $i_getfailed = count($this->mGetFailed);
    $li = $dom->createElement('li', "Resources(" . ($i_getok + $i_getfailed) . " : √ $i_getok, × $i_getfailed): ");
    $span = $dom->createElement('span', "+++");
    $span->setAttribute('style', 'cursor: pointer;');
    $span->setAttribute('onclick', "javascript:obj=getElementById('fwolf_todatauri_info_resources_list');if ('none'==obj.style.display || ''==obj.style.display) {obj.style.display='block'; this.textContent='---';} else {obj.style.display='none';this.textContent='+++';}");
    $li->appendChild($span);
    $dom_info_ul->appendChild($li);

    // Resources detail list as sub-ol, hidden until the toggle is clicked
    $dom_resources_ol = $dom->createElement('ol');
    $dom_resources_ol->setAttribute('id', 'fwolf_todatauri_info_resources_list');
    $dom_resources_ol->setAttribute('style', 'display: none;');
    $ar_resources = array(
        '√: ' => $this->mGetOk,
        '×: ' => $this->mGetFailed,
    );
    foreach ($ar_resources as $label => $ar_url) {
        foreach ($ar_url as $val) {
            $dom_resources_ol->appendChild($this->AddInfoLink($dom, $label, $val));
        }
    }
    $dom_info_ul->appendChild($dom_resources_ol);
    if ($this->mCliMode)
        echo "[Done ] Resources: √: " . $i_getok . ", ×: " . $i_getfailed . ".\n";

    // Html like this has no <body>, so it must be created:
    // <html>
    // <meta http-equiv="refresh" content="0;url=http://www.baidu.com/">
    // </html>
    $body = $dom->getElementsByTagName('body')->item(0);
    if (null === $body) {
        $body = $dom->createElement('body');
        $html = $dom->getElementsByTagName('html')->item(0);
        if (null === $html)
            $html = $dom->documentElement;
        if (null === $html) {
            // Document is completely empty, give it a root to hang <body> on
            $html = $dom->createElement('html');
            $dom->appendChild($html);
        }
        $html->appendChild($body);
    }

    $div = $dom->createElement('div');
    $div->setAttribute('id', 'fwolf_save_file_all_in_one_info');
    $div->setAttribute('style', 'clear: both;');
    $hr = $dom->createElement('hr');
    $hr->setAttribute('style', 'border: 0px; height: 1px; color: #B0C4DE; background-color: #B0C4DE;');
    $div->appendChild($hr);
    $div->appendChild($dom_info_ul);
    $body->appendChild($div);
} // end of func AddInfo


/**
 * Build a <li> made of a text label followed by a link to $url.
 *
 * The link text is escaped by hand because createElement() does not escape
 * '&' in its value. The href is passed raw because attribute values are
 * escaped when the document is serialized; escaping it here as well would
 * turn every '&' in the url into '&amp;amp;'.
 *
 * @param DOMDocument $dom    Document used to create the nodes.
 * @param string      $label  Text placed before the link.
 * @param string      $url    Url used for both link text and href.
 * @return DOMElement
 */
protected function AddInfoLink($dom, $label, $url)
{
    $a = $dom->createElement('a', htmlspecialchars($url));
    $a->setAttribute('href', $url);
    $li = $dom->createElement('li', $label);
    $li->appendChild($a);
    return $li;
} // end of func AddInfoLink
||
224 | |||
225 | |||
/**
 * Rewrite one attribute of every matching element to data:URI form.
 *
 * Each <$tag> element whose attributes equal all pairs given in $cond has
 * its $attr value passed to ParseUrl(); a non-empty result replaces the
 * attribute value, an empty result leaves the original value untouched.
 * <script> elements with an empty src are always skipped.
 *
 * @param DOMDocument $dom   DOMDocument object, modified in place
 * @param string      $tag   Tag name to look for
 * @param string      $attr  Attribute which holds the url
 * @param array       $cond  Condition, eg: rel=>'stylesheet' for link css
 */
protected function DomChange(&$dom, $tag, $attr, $cond=array())
{
    foreach ($dom->getElementsByTagName($tag) as $node) {
        // Every condition must equal the element's attribute value
        $matched = true;
        if (!empty($cond)) {
            foreach ($cond as $name => $expected) {
                if ($expected != $node->getAttribute($name)) {
                    $matched = false;
                    break;
                }
            }
        }

        // In-document js has no src, nothing to retrieve for it
        $inline_script = ('script' == $tag) && ('' == $node->getAttribute('src'));
        if (!$matched || $inline_script) {
            continue;
        }

        // Keep the original value when the resource cannot be converted
        $data = $this->ParseUrl($node->getAttribute($attr));
        if (!empty($data)) {
            $node->setAttribute($attr, $data);
        }
    }
} // end of func DomChange
||
264 | |||
265 | |||
/**
 * Change urls in embedded style to data:URI form.
 *
 * Two places are treated:
 *  - the content of <style> elements, for both '@import "file.css"' and
 *    'url(...)' expressions,
 *  - the style attribute of a fixed list of common tags, for 'url(...)'
 *    expressions only.
 *
 * Linked style is not handled here, it is already parsed by:
 * $this->DomChange($dom, 'link', 'href', array('rel'=>'stylesheet'));
 *
 * @param DOMDocument $dom  DOMDocument object, modified in place
 */
protected function DomChangeStyle(&$dom)
{
    // In both patterns group 1 is the whole expression, group 2 the url.
    // Example1, with @import, no url(
    // @import "mystyle.css";
    // @import "../hide2.css";
    $regex_import = "/(@import\s*\(?['\"]([^'\"\(\)\{\}]+)['\"]\s*\)?)/i";
    // Example2, with url(, regardless of @import
    // url("../hide1a.css");
    // url(../hide1b.css);
    $regex_url = "/(url\s*\(['\"]?\s*([^'\"\(\)\{\}]+)['\"]?\s*\))/i";

    // <style> elements
    foreach ($dom->getElementsByTagName('style') as $item) {
        $src = $item->nodeValue;
        if (empty($src)) continue;

        foreach (array($regex_import, $regex_url) as $regex) {
            $changed = $this->DomChangeStyleUrl($src, $regex);
            if (null !== $changed) {
                $src = $changed;
                // Write result to dom
                $item->nodeValue = $src;
            }
        }
    }

    // Inline style attribute
    // :QUESTION: Does this many tags slow down treatment?
    $ar_tags = array('a', 'blockquote', 'body', 'button', 'code', 'dd', 'del', 'div', 'dl', 'dt', 'form', 'hr', 'img', 'input', 'li', 'ol', 'option', 'p', 'pre', 'q', 'select', 'small', 'span', 'strong', 'table', 'td', 'textarea', 'th', 'tr', 'ul');
    foreach ($ar_tags as $tag) {
        foreach ($dom->getElementsByTagName($tag) as $item) {
            $src = $item->getAttribute('style');
            if (empty($src)) continue;

            // The url pattern must stop at ')' here as well, otherwise an
            // unquoted url followed by any other '(...)' in the same
            // attribute is captured up to the last closing parenthesis.
            $changed = $this->DomChangeStyleUrl($src, $regex_url);
            if (null !== $changed)
                $item->setAttribute('style', $changed);
        }
    }
} // end of func DomChangeStyle


/**
 * Replace every url found by $regex in a piece of css with its data:URI.
 *
 * Urls which ParseUrl() cannot convert are left as they are.
 *
 * @param string $css    Css source to search
 * @param string $regex  Pattern whose first group is the whole expression
 *                       and second group is the url
 * @return string|null   Resulting css, or null when $regex matched nothing
 */
protected function DomChangeStyleUrl($css, $regex)
{
    $ar = $this->Match($regex, $css);
    if (empty($ar))
        return null;

    // Treat a single match the same way as multiple matches
    if (!is_array($ar[0]))
        $ar = array(0 => $ar);

    foreach ($ar as $val) {
        $data = $this->ParseUrl($val[1]);
        if (!empty($data)) {
            // Replace by whole expression, because a url can be used multiple times.
            $replaced = str_replace($val[1], $data, $val[0]);
            $css = str_replace($val[0], $replaced, $css);
        }
    }
    return $css;
} // end of func DomChangeStyleUrl
||
379 | |||
380 | |||
/**
 * Compute the base url from the first (page) request.
 *
 * The base url is used to resolve css, js and image urls. It is not the
 * same as the host name, it may include some directories.
 * Must be executed right after the initial request, because it reads the
 * effective url from the curl handle and any later request changes it.
 *
 * Sets $mUrlBase and $sUrlPlan, appends a line to $mInfo and echoes the
 * result in cli mode.
 */
protected function GetBaseUrl()
{
    // Whether the input url is a dir or a file is decided by the
    // webserver, so the effective url reported by curl is used instead of
    // computing the base url from $this->mUrl.
    // It still goes wrong for a url like this:
    // $url = 'http://131.2.101.10/sys/phpinfo.php/aa';
    // :TODO: check what link a browser generates in the situation above
    $baseurl = (string) curl_getinfo($this->mSh, CURLINFO_EFFECTIVE_URL);

    // Keep the path part only, dropping everything after the last '/'.
    // A url without path, like http://131.2.101.10, is left untouched.
    $baseurl = preg_replace('/(http|https)(:\/\/.+)\/[^\/]*$/i', '\1\2', $baseurl);

    // Add the missing trailing '/'.
    // substr() is used instead of the {} string offset, which is no
    // longer valid php syntax and also fails on an empty string.
    if ('/' != substr($baseurl, -1))
        $baseurl .= '/';
    $this->mUrlBase = $baseurl;

    // Url plan
    $this->sUrlPlan = UrlPlan($this->mUrlBase);

    $this->mInfo .= "Baseurl: $baseurl<br />\n";
    if ($this->mCliMode)
        echo "[Curl ] Baseurl: $baseurl\n";
} // end of func GetBaseUrl
||
414 | |||
415 | |||
/**
 * Check if a user supplied url is safe to retrieve.
 *
 * A url is rejected when it is shorter than 13 chars, does not start with
 * http:// or https://, or when its host part starts with one of the
 * blocked prefixes (localhost, 127.0.0.1, its decimal form 2130706433,
 * and 192.168.0.). The host comparison is case-insensitive.
 *
 * When the url is rejected a message is appended to $mMsg.
 *
 * NOTE(review): this is a prefix check on the literal host text only,
 * other private ranges, userinfo ('user@host') and names resolving to a
 * local address are not detected.
 *
 * @param string $url
 * @return boolean
 */
protected function IsSafe($url)
{
    $safe = true;
    if (13 > strlen($url)) $safe = false;

    $url_http = strtolower(substr($url, 0, 8));
    if (('http://' != substr($url_http, 0, 7)) && ('https://' != $url_http))
        $safe = false;

    $hostname = preg_replace('/^(http|https):\/\/([^\/]+)\/?.*/i', '\2', $url);
    // Host names are case-insensitive, compare in lower case
    $hostname = strtolower($hostname);

    // Compare each prefix using its own length, a fixed length of 9 could
    // never match the 10 char long '2130706433'.
    $ar_blocked = array('localhost', '127.0.0.1', '2130706433', '192.168.0.');
    foreach ($ar_blocked as $prefix) {
        if (0 === strncmp($hostname, $prefix, strlen($prefix))) {
            $safe = false;
            break;
        }
    }
    // :TODO: Can't do with my self

    // Message says: target url is not safe, please do not play with my server
    if (false == $safe)
        $this->mMsg .= "目标网址不安全,不要折腾我的服务器啦~拜托(" . ip2long($hostname) . ")<br />\n";
    return $safe;
} // end of func IsSafe
||
439 | |||
440 | |||
/**
 * Convert the html content to utf-8.
 *
 * The original charset is read from a meta tag like
 * <meta http-equiv="Content-Type" content="text/html;charset=gb2312">
 * and, when there is none (or it holds the placeholder 'CHARSET'),
 * detected from the content. The old declaration is removed, a utf-8
 * declaration is put right after the opening <head> tag and the content
 * is converted when needed.
 *
 * Sets $mCharset, appends to $mInfo and echoes the charset in cli mode.
 *
 * NOTE(review): a charset declared by the page is passed to
 * mb_convert_encoding() unchecked, an unknown name will make it fail.
 *
 * @see $mHtml
 */
protected function MbConvert()
{
    // Find the charset the web page declares.
    // '-' is kept last in the character class: '\d-_' is rejected as an
    // invalid range by the PCRE2 library used since php 7.3, and \w
    // already covers digits and '_'.
    $ar = $this->Match('/(<meta[^>]+content=[^>]+charset=([\w-]+)[\"\'][^>]*>)/i');
    $charset = '';
    if (is_array($ar)) {
        // For multi charset declaration, use the first one
        if ((isset($ar[0])) && (is_array($ar[0])))
            $ar = $ar[0];
        if (1 < count($ar))
            $charset = strtolower($ar[1]);
    }

    // Discuz! v4.0.0 error, printed page contains the un-replaced placeholder:
    //<meta http-equiv="Content-Type" content="text/html; charset=CHARSET">
    // Treat it as not declared, so it is detected below.
    if ('charset' == $charset)
        $charset = '';
    // :THINK: Use mb_check_encoding check again?

    // Meta Content-type
    $meta = '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
    // \b keeps <header ...> from being taken for <head>, and only the
    // first opening tag is replaced.
    $regex_head = '/<head\b[^>]*>/i';

    if (!empty($charset)) {
        // Remove old declaration
        $this->mHtml = str_replace($ar[0], '', $this->mHtml);
        // Put meta close to head, so no non-ascii will occur before it
        $this->mHtml = preg_replace($regex_head, $meta, $this->mHtml, 1);
        if ('utf-8' != $charset) {
            $this->mHtml = mb_convert_encoding($this->mHtml, 'utf-8', $charset);
        }
        $this->mInfo .= "Original charset: $charset<br />\n";
    } else {
        // Doc has no charset meta, detect charset and force the meta in.
        // Detection returns false when nothing fits, in which case the
        // content is left as it is instead of converting from ''.
        $detected = mb_detect_encoding($this->mHtml
            , "gb2312, gbk, big5, utf-8");
        $charset = (false === $detected) ? '' : strtolower($detected);
        if (('' != $charset) && ('utf-8' != $charset)) {
            $this->mHtml = mb_convert_encoding($this->mHtml, 'utf-8', $charset);
            $this->mInfo .= "Original charset: $charset<br />\n";
        }
        $this->mHtml = preg_replace($regex_head, $meta, $this->mHtml, 1);
    }

    $this->mCharset = $charset;
    if ($this->mCliMode)
        echo "[Curl ] Original charset: $charset.\n";
} // end of func MbConvert
||
500 | |||
501 | |||
/**
 * Output the resulting html as a download.
 *
 * The file name is the original url without its http(s):// prefix, with
 * each of / ? & ; = : replaced by '_' and '.html' appended.
 */
public function OutputDownload()
{
    // Drop the scheme, then replace chars not wanted in a file name
    $name = preg_replace('/^(http|https):\/\/(.*)/i', '\2', $this->mUrl);
    $name = str_replace(array('/', '?', '&', ';', '=', ':'), '_', $name);

    Download($this->mHtml, $name . '.html');
} // end of func OutputDownload
||
513 | |||
514 | |||
/**
 * Retrieve the web page and parse it.
 *
 * Does nothing when no url is set. Otherwise the page is retrieved (or,
 * with $mRetrieveHtml false, $mHtml is used as supplied), converted to
 * utf-8, cleaned up and loaded into a DOMDocument. Urls in embedded
 * style, <img src>, <link rel="stylesheet" href> and <script src> are
 * then changed to data:URI form, the info block is added and the result
 * is written back to $mHtml.
 *
 * On failure a message is appended to $mMsg and $mHtml is not converted.
 */
public function Parse()
{
    if (empty($this->mUrl))
        return;

    if ($this->mCliMode)
        echo "[Curl ] Get html content from $this->mUrl\n";
    $this->SetoptReferer($this->mUrl);
    if (true == $this->mRetrieveHtml)
        $this->mHtml = $this->Get($this->mUrl);
    else {
        // Do a dummy Get action, mRs is used in Match() (and/or etc...)
        $this->Get($this->mUrl);
        $this->mRs = $this->mHtml;
    }

    if (0 == strlen((string) $this->mHtml))
    {
        // Some error happened
        $this->mMsg .= curl_error($this->mSh);
        if ($this->mCliMode)
            echo "[Curl ] Failed.\n";
        return;
    }

    if ($this->mCliMode)
        echo "[Curl ] Ok, "
            . number_format(strlen($this->mRs))
            . " bytes.\n";
    // Must run before any other request changes the curl handle info
    $this->GetBaseUrl();
    $this->MbConvert();

    // Do some cleanup with html code
    $this->PreParse();

    $dom = new DOMDocument();
    // Keep original format when output
    $dom->preserveWhiteSpace = true;

    // Converting non-ascii chars to entities erases some un-wellformed
    // html errors, like un-supported/un-readable chars etc.
    // NOTE(review): the 'HTML-ENTITIES' encoding is deprecated since php 8.2.
    $this->mHtml = mb_convert_encoding($this->mHtml
        , 'HTML-ENTITIES', "UTF-8");

    // Real world html is rarely well-formed. Collect parser messages
    // internally instead of suppressing them with @, and check the result
    // explicitly.
    $libxml_previous = libxml_use_internal_errors(true);
    $loaded = $dom->loadHTML($this->mHtml);
    libxml_clear_errors();
    libxml_use_internal_errors($libxml_previous);
    if (false === $loaded)
    {
        $this->mMsg .= "Parse html content failed.<br />\n";
        if ($this->mCliMode)
            echo "[Dom ] Failed.\n";
        return;
    }
    // :TODO: If parse all relative link href, can I make a proxy ?

    // Embedded style first, it is the 'slow' step
    $this->DomChangeStyle($dom);

    $this->DomChange($dom, 'img', 'src');
    $this->DomChange($dom, 'link', 'href', array('rel'=>'stylesheet'));
    // No type condition for js, every script with a src is treated
    $this->DomChange($dom, 'script', 'src');

    $this->AddInfo($dom);
    $this->mHtml = $dom->saveHTML();
} // end of func Parse
||
584 | |||
585 | |||
/**
 * Resolve a url found in the page and convert it to data:URI format.
 *
 * Handles, in this order:
 *  - http:// and https:// urls, used as they are,
 *  - protocol-relative urls (//host/path), prefixed with the scheme of
 *    the base url,
 *  - absolute paths (/path), appended to scheme and host of the base url,
 *  - anything else, treated as relative path and appended to the base url.
 *
 * @param string $url
 * @return string  data:URI, or empty string when $url is empty, already
 *                 is a data:URI, or could not be retrieved
 */
protected function ParseUrl($url)
{
    // Urls captured from css may carry surrounding whitespace
    $url = trim((string) $url);
    if (empty($url))
        return '';

    $src = strtolower($url);
    // Already embedded, nothing to retrieve. Returning empty makes the
    // caller keep the original value.
    if ('data:' == substr($src, 0, 5))
        return '';

    // Uri start from http
    if (('http://' == substr($src, 0, 7)) || ('https://' == substr($src, 0, 8)))
        return $this->ParseUrl2Data($url);

    // Protocol-relative, eg: IBM developerworks
    if ('//' == substr($src, 0, 2))
        return $this->ParseUrl2Data($this->sUrlPlan . ':' . $url);

    // $url[0] replaces the {} string offset, which is no longer valid
    // php syntax.
    if ('/' == $url[0])
    {
        // Absolute path, compute start from host name
        $baseurl = preg_replace('/(http|https)(:\/\/[^\/]+)\/.*/i', '\1\2', $this->mUrlBase);
        $objurl = $baseurl . $url;
    }
    else
    {
        // Relative path
        $objurl = $this->mUrlBase . $url;
    }

    // Got result url, parse & return
    return $this->ParseUrl2Data($objurl);
} // end of func ParseUrl
||
621 | |||
622 | |||
/**
 * Retrieve a http object and return it as data:URI.
 *
 * A successful result is stored in $mCache and served from there on the
 * next request for the same url. Every retrieval attempt is recorded in
 * $mGetOk or $mGetFailed, and echoed with a running number in cli mode.
 *
 * @param string $url
 * @return string  data:URI, or empty string when retrieve failed
 */
protected function ParseUrl2Data($url)
{
    // Served from cache, no request and no bookkeeping
    if (isset($this->mCache[$url])) {
        return $this->mCache[$url];
    }

    $content = $this->Get($url);
    if (0 < strlen($this->mRs)) {
        // Result is not used, call kept as in the original flow
        $this->GetLastCode();
        $type = $this->GetLastContentType();

        $result = 'data:' . $type . ';base64,' . base64_encode($content);
        $this->mCache[$url] = $result;
        $this->mGetOk[] = $url;
        $mark = '√';
    } else {
        // Fail
        $result = '';
        $this->mGetFailed[] = $url;
        $mark = '×';
    }

    if ($this->mCliMode) {
        // Running number of resources treated so far, 3 digits
        $seq = substr('000' . strval(count($this->mGetOk) + count($this->mGetFailed)), -3);
        echo "[" . $seq . " ] " . $mark . ": $url\n";
    }
    return $result;
} // end of func ParseUrl2Data
||
658 | |||
659 | |||
/**
 * Cleanup html code before it is loaded into DOM.
 *
 * Removes xml markup which DOM can't treat well:
 *  - the xml declaration (version="1.0" encoding="utf-8" ...),
 *  - the attributes of an opening html tag which starts with xmlns, like
 *    <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 *    which becomes a plain <html>.
 */
protected function PreParse() {
    // Applied in order, each pattern with the replacement at the same index
    $ar_pattern = array(
        '/<\?xml version=[^>]+>/i',
        '/<html\s+xmlns=[^>]+>/i',
    );
    $ar_replace = array(
        '',
        '<html>',
    );
    $this->mHtml = preg_replace($ar_pattern, $ar_replace, $this->mHtml);
} // end of func PreParse
||
672 | |||
673 | |||
/**
 * Set url of the web page to process.
 *
 * The url is url-decoded first (so an encoded url, eg: chinese, is
 * converted back to original) and stored in $mUrl only when IsSafe()
 * accepts the decoded form. Checking the decoded value matters: the
 * decoded url is the one that gets requested, so a percent-encoded host
 * must not be able to slip past the check.
 *
 * An empty or unsafe url leaves $mUrl unchanged.
 *
 * @param string $url
 */
public function SetUrl ($url) {
    if (empty($url))
        return;

    $url = urldecode($url);
    if ($this->IsSafe($url))
        $this->mUrl = $url;
} // end of func SetUrl
||
685 | |||
686 | |||
687 | } // end of class ToDataUri |
||
688 | ?> |
||
689 |
If you suppress an error, we recommend checking for the error condition explicitly: