fwolf /
fwlib
This project does not seem to handle request data directly; as such, no vulnerable execution paths were found.
include, or for example
via PHP's auto-loading mechanism.
These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
| 1 | <?php |
||
| 2 | require_once(dirname(__FILE__) . '/fwolflib.php'); |
||
| 3 | require_once(FWOLFLIB . 'class/curl.php'); |
||
| 4 | require_once(FWOLFLIB . 'func/download.php'); |
||
| 5 | require_once(FWOLFLIB . 'func/env.php'); |
||
| 6 | require_once(FWOLFLIB . 'func/request.php'); |
||
| 7 | require_once(FWOLFLIB . 'func/url.php'); |
||
| 8 | |||
| 9 | |||
| 10 | /** |
||
| 11 | * Convert css, js, image in a html file, to save it in ONE file like mht. |
||
| 12 | * |
||
| 13 | * @package fwolflib |
||
| 14 | * @copyright Copyright 2007-2012, Fwolf |
||
| 15 | * @author Fwolf <[email protected]> |
||
| 16 | * @since 2007-04-06 |
||
| 17 | */ |
||
| 18 | class ToDataUri extends Curl { |
||
/**
 * Cache of resources already retrieved.
 * Format: url => data:URI string (filled by ParseUrl2Data()).
 * @var array
 */
protected $mCache = array();

/**
 * Charset of the original web page.
 * Shown in the info block appended by AddInfo().
 * @var string
 */
protected $mCharset = '';

/**
 * Running in cli mode.
 * When true, progress messages are echoed directly.
 * @var boolean
 */
protected $mCliMode = false;

/**
 * URIs which failed to be retrieved.
 * Only for debug or output purposes.
 * @var array
 */
protected $mGetFailed = array();

/**
 * URIs which were retrieved successfully.
 * @var array
 */
protected $mGetOk = array();

/**
 * Html code got from the target.
 * Changes are applied to this value as well, so it can be output directly.
 * @var string
 */
public $mHtml = '';

/**
 * Information about the process, displayed in the footer. (obsolete?)
 * @var string
 * @see $mMsg
 */
public $mInfo = '';

/**
 * Simple response message.
 * Displayed below the form.
 * @var string
 * @see $mInfo
 */
public $mMsg = '';

/**
 * Whether to retrieve the html data.
 * Html is retrieved by url by default; if set to false, $this->mHtml must be set manually.
 * @var boolean
 * @see $mHtml
 */
public $mRetrieveHtml = true;

/**
 * Original url.
 * The web page which contains css, js and images.
 * @var string
 */
public $mUrl = '';

/**
 * Base url of the target web page.
 * eg: for http://tld.com/dir/index.html the base url is http://tld.com/dir/
 * @var string
 */
protected $mUrlBase = '';

/**
 * Url plan (http or https) of the base url.
 * Assigned in GetBaseUrl() from UrlPlan($this->mUrlBase).
 * @var string
 */
protected $sUrlPlan = '';
||
| 102 | |||
| 103 | |||
/**
 * Constructor.
 *
 * Initialises the parent Curl object, stores the target url (only when
 * SetUrl() accepts it), passes false to SetoptSslverify() and records
 * whether the script runs from the command line.
 *
 * @param string $url  Url of the web page to process, may be empty
 */
public function __construct($url = '')
{
    parent::__construct();

    $this->SetUrl($url);
    // NOTE(review): presumably switches off SSL certificate verification
    // in the parent Curl wrapper - confirm against class/curl.php.
    $this->SetoptSslverify(false);

    // Only ever switch cli mode on here, never off
    $this->mCliMode = IsCli() ? true : $this->mCliMode;
} // end of func __construct
||
| 118 | |||
| 119 | |||
/**
 * Append a process information block to the end of the document body.
 *
 * The block is a div (id fwolf_save_file_all_in_one_info) holding a list
 * with the original url, the original charset, the generator, the generate
 * time and a collapsible list of every resource retrieved or failed.
 * A body element is created when the document has none, eg:
 *   <html>
 *   <meta http-equiv="refresh" content="0;url=http://www.baidu.com/">
 *   </html>
 *
 * @param DOMDocument $dom  Document to modify in place
 */
protected function AddInfo (&$dom) {
    $dom_info_ul = $dom->createElement('ul');
    $dom_info_ul->setAttribute('style', 'text-align: left');

    // Original url
    $a = $dom->createElement('a', htmlspecialchars($this->mUrl));
    $a->setAttribute('href', $this->mUrl);
    $li = $dom->createElement('li', "Original url: ");
    $li->appendChild($a);
    $dom_info_ul->appendChild($li);

    // Original charset
    $li = $dom->createElement('li', htmlspecialchars("Original charset: {$this->mCharset}"));
    $dom_info_ul->appendChild($li);

    // Url of this script
    if ($this->mCliMode) {
        $li = $dom->createElement('li', "Generate using Fwolf's 'Save html all in one file' tools(cli mode php script).");
    } else {
        $a = $dom->createElement('a', "Fwolf's 'Save html all in one file' tools");
        $a->setAttribute('href', GetSelfUrl(false));
        $li = $dom->createElement('li', "Generate using: ");
        $li->appendChild($a);
    }
    $dom_info_ul->appendChild($li);

    // Generate time
    $li = $dom->createElement('li', htmlspecialchars("Generate time: " . date('Y-m-d G:i:s')));
    $dom_info_ul->appendChild($li);

    // Resources summary, with a clickable toggle for the detail list
    $i_getok = count($this->mGetOk);
    $i_getfailed = count($this->mGetFailed);
    $li = $dom->createElement('li', "Resources(" . ($i_getok + $i_getfailed) . " : √ $i_getok, × $i_getfailed): ");
    $span = $dom->createElement('span', "+++");
    $span->setAttribute('style', 'cursor: pointer;');
    $span->setAttribute('onclick', "javascript:obj=getElementById('fwolf_todatauri_info_resources_list');if ('none'==obj.style.display || ''==obj.style.display) {obj.style.display='block'; this.textContent='---';} else {obj.style.display='none';this.textContent='+++';}");
    $li->appendChild($span);
    $dom_info_ul->appendChild($li);

    // Resources detail list as sub-ol, hidden until toggled
    $dom_resources_ol = $dom->createElement('ol');
    $dom_resources_ol->setAttribute('id', 'fwolf_todatauri_info_resources_list');
    $dom_resources_ol->setAttribute('style', 'display: none;');
    $this->AddInfoResourceItems($dom, $dom_resources_ol, $this->mGetOk, '√: ');
    $this->AddInfoResourceItems($dom, $dom_resources_ol, $this->mGetFailed, '×: ');
    $dom_info_ul->appendChild($dom_resources_ol);
    if ($this->mCliMode)
        echo "[Done ] Resources: √: " . $i_getok . ", ×: " . $i_getfailed . ".\n";

    // Find the body, or create it when the html has none
    $body = $dom->getElementsByTagName('body')->item(0);
    if (null === $body) {
        $root = $dom->getElementsByTagName('html')->item(0);
        if (null === $root)
            $root = $dom->documentElement;
        if (null === $root) {
            // Completely empty document, build the skeleton ourselves
            // instead of calling a method on null.
            $root = $dom->createElement('html');
            $dom->appendChild($root);
        }
        $body = $dom->createElement('body');
        $root->appendChild($body);
    }

    $div = $dom->createElement('div');
    $div->setAttribute('id', 'fwolf_save_file_all_in_one_info');
    $div->setAttribute('style', 'clear: both;');
    $hr = $dom->createElement('hr');
    $hr->setAttribute('style', 'border: 0px; height: 1px; color: #B0C4DE; background-color: #B0C4DE;');
    $div->appendChild($hr);
    $div->appendChild($dom_info_ul);
    $body->appendChild($div);
} // end of func AddInfo


/**
 * Append one list item per url to a resource list of the info block.
 *
 * @param DOMDocument $dom   Document used to create the nodes
 * @param DOMElement  $list  List element the items are appended to
 * @param array       $urls  Urls to list
 * @param string      $mark  Text put before each link, eg: '√: '
 */
protected function AddInfoResourceItems($dom, $list, $urls, $mark)
{
    foreach ($urls as $url) {
        // The text given to createElement() is not escaped by DOM, so it
        // is escaped here. The attribute value is escaped by DOM when the
        // document is serialised, so href must receive the raw url or
        // every '&' would end up double encoded.
        $a = $dom->createElement('a', htmlspecialchars($url));
        $a->setAttribute('href', $url);
        $li = $dom->createElement('li', $mark);
        $li->appendChild($a);
        $list->appendChild($li);
    }
} // end of func AddInfoResourceItems
||
| 224 | |||
| 225 | |||
/**
 * Rewrite one attribute of every matching element to data:URI style.
 *
 * For each $tag element of $dom that fulfils $cond, the value of $attr is
 * given to ParseUrl() and replaced by the result. When ParseUrl() returns
 * an empty value the original attribute is left untouched.
 *
 * @param DOMDocument $dom   DOMDocument object, modified in place
 * @param string      $tag   Tag name to look for
 * @param string      $attr  Attribute holding the url
 * @param array       $cond  Condition, attribute=>value pairs which must
 *                           all match, eg: rel=>'stylesheet' for link css
 */
protected function DomChange(&$dom, $tag, $attr, $cond=array())
{
    $is_script = ('script' == $tag);

    foreach ($dom->getElementsByTagName($tag) as $node) {
        // Every condition attribute must carry the wanted value
        $wanted = true;
        if (!empty($cond)) {
            foreach ($cond as $name => $expected) {
                if ($expected != $node->getAttribute($name)) {
                    $wanted = false;
                    break;
                }
            }
        }

        // In-document js is a script element too, but its src is empty
        if ($is_script && ('' == $node->getAttribute('src'))) {
            $wanted = false;
        }

        if (!$wanted) {
            continue;
        }

        // Keep the original value when nothing could be retrieved
        $data = $this->ParseUrl($node->getAttribute($attr));
        if (!empty($data)) {
            $node->setAttribute($attr, $data);
        }
    }
} // end of func DomChange
||
| 264 | |||
| 265 | |||
/**
 * Change urls of embedded style in dom to data:URI style.
 *
 * Two kinds of embedded style are treated:
 *  - content of style elements, both '@import "x.css"' and 'url(x)' forms
 *  - style attributes of any element, 'url(x)' form only
 * Linked style sheets are not treated here, they are parsed by:
 *   $this->DomChange($dom, 'link', 'href', array('rel'=>'stylesheet'));
 *
 * @param DOMDocument $dom  DOMDocument object, modified in place
 */
protected function DomChangeStyle(&$dom)
{
    $ar_regex = array(
        // Example1, with @import, no url(
        //   @import "mystyle.css";
        //   @import "../hide2.css";
        "/(@import\s*\(?['\"]([^'\"\(\)\{\}]+)['\"]\s*\)?)/i",
        // Example2, with url(, regardless of @import
        //   url("../hide1a.css");
        //   url(../hide1b.css);
        "/(url\s*\(['\"]?\s*([^'\"\(\)\{\}]+)['\"]?\s*\))/i",
    );

    // Content of style elements
    $items = $dom->getElementsByTagName('style');
    for ($i = 0; $i < $items->length; $i++) {
        $item = $items->item($i);

        $src = $item->nodeValue;
        if (empty($src)) continue;

        foreach ($ar_regex as $regex) {
            $changed = $this->ReplaceCssUrl($src, $regex);
            if (null !== $changed) {
                // Write result to dom
                $src = $changed;
                $item->nodeValue = $src;
            }
        }
    }

    // Style attributes. Every element carrying one is treated, selected
    // with a single XPath query instead of one lookup per known tag name.
    // The url( pattern must not run over ')' or a value holding several
    // url(...) would be captured as one bogus url, so the same pattern
    // as for style elements is used.
    $xpath = new DOMXPath($dom);
    foreach ($xpath->query('//*[@style]') as $item) {
        $src = $item->getAttribute('style');
        if (empty($src)) continue;

        $changed = $this->ReplaceCssUrl($src, $ar_regex[1]);
        if (null !== $changed) {
            // Write result to dom
            $item->setAttribute('style', $changed);
        }
    }
} // end of func DomChangeStyle


/**
 * Replace every url found by $regex in a piece of css with its data:URI.
 *
 * $regex must capture the whole expression first and the url second, like
 * the patterns in DomChangeStyle(). Urls ParseUrl() returns nothing for
 * are left as they are.
 *
 * @param string $css    Css source
 * @param string $regex  Pattern to find urls with
 * @return string|null   Css after replacement, null when $regex found nothing
 */
protected function ReplaceCssUrl($css, $regex)
{
    $ar = $this->Match($regex, $css);
    if (empty($ar))
        return null;

    // Do as multi match.
    // NOTE(review): Match() appears to return a flat array for a single
    // hit and an array of arrays for several - confirm in class/curl.php.
    if (!is_array($ar[0]))
        $ar = array(0 => $ar);

    foreach ($ar as $val) {
        $s = $this->ParseUrl($val[1]);
        if (!empty($s)) {
            // Use whole match to do str_replace, because url can be used multi times.
            $s = str_replace($val[1], $s, $val[0]);
            $css = str_replace($val[0], $s, $css);
        }
    }
    return $css;
} // end of func ReplaceCssUrl
||
| 379 | |||
| 380 | |||
/**
 * Compute base url and url plan from the initial get.
 *
 * The base url is used to resolve css, js and images. It is not the same
 * as the hostname, it may include some dir.
 * Must execute close to the initial curl_exec, otherwise the curl stats
 * read here will have been changed by other get actions.
 *
 * Sets $mUrlBase and $sUrlPlan, appends a line to $mInfo.
 */
protected function GetBaseUrl()
{
    // Input URL is a dir or a file -> Use the url webserver uses
    // But still will get it wrong when url is like this:
    //   $url = 'http://131.2.101.10/sys/phpinfo.php/aa';
    // :TODO: check what link will browser generate in upper situation

    // curl_getinfo can recognize dir/file of an address,
    // so here cannot use $this->mUrl + preg_replace to compute baseurl
    $baseurl = (string) curl_getinfo($this->mSh, CURLINFO_EFFECTIVE_URL);

    // Query string and fragment are not part of the path, and a '/'
    // inside them must not be taken for a directory separator.
    $baseurl = preg_replace('/[?#].*$/s', '', $baseurl);

    // Get the path part of url, should end with '/', exclude this:
    //   http://131.2.101.10
    $baseurl = preg_replace('/(http|https)(:\/\/.+)\/[^\/]*$/i', '\1\2', $baseurl);

    // Add the missing trailing '/' in some special condition.
    // substr() is used because the {} string offset syntax is no longer
    // valid in PHP 8, and it also copes with an empty string.
    if ('/' != substr($baseurl, -1))
        $baseurl .= '/';
    $this->mUrlBase = $baseurl;

    // Url plan
    $this->sUrlPlan = UrlPlan($this->mUrlBase);

    $this->mInfo .= "Baseurl: $baseurl<br />\n";
    if ($this->mCliMode)
        echo "[Curl ] Baseurl: $baseurl\n";
} // end of func GetBaseUrl
||
| 414 | |||
| 415 | |||
/**
 * Check if user input url is safe to retrieve.
 *
 * A url is refused when it is shorter than 13 chars, is not http/https,
 * has no recognizable host, or its host starts with one of: localhost,
 * 127.0.0.1, 2130706433 (127.0.0.1 as decimal), 192.168.0., or is [::1].
 * A message is appended to $mMsg when the url is refused.
 *
 * NOTE(review): this is a small blacklist only, other private/reserved
 * ranges and hosts resolving to them by DNS are not covered.
 *
 * @param string $url
 * @return boolean
 */
protected function IsSafe($url)
{
    $safe = true;
    if (13 > strlen($url)) $safe = false;

    $url_http = strtolower(substr($url, 0, 8));
    if (('http://' != substr($url_http, 0, 7)) && ('https://' != $url_http))
        $safe = false;

    // parse_url() gives the bare host, without userinfo and port, so
    // 'http://user@localhost:80/' can not slip through the prefix checks.
    $hostname = parse_url($url, PHP_URL_HOST);
    if (!is_string($hostname) || ('' === $hostname)) {
        $hostname = '';
        $safe = false;
    }
    // Host names are case insensitive
    $hostname = strtolower($hostname);

    if ('localhost' === substr($hostname, 0, 9)) $safe = false;
    if ('127.0.0.1' === substr($hostname, 0, 9)) $safe = false;
    // 10 chars long, comparing 9 chars only could never match
    if ('2130706433' === substr($hostname, 0, 10)) $safe = false;
    if ('192.168.0.' === substr($hostname, 0, 10)) $safe = false;
    // IPv6 loopback
    if ('[::1]' === $hostname) $safe = false;
    // :TODO: Can't do with my self

    if (false == $safe)
        $this->mMsg .= "目标网址不安全,不要折腾我的服务器啦~拜托(" . ip2long($hostname) . ")<br />\n";
    return $safe;
} // end of func IsSafe
||
| 439 | |||
| 440 | |||
/**
 * Convert content html to utf8.
 *
 * The charset is taken from a meta declaration like
 *   <meta http-equiv="Content-Type" content="text/html;charset=gb2312">
 * and detected from the content when there is none (or an unusable one).
 * The old declaration is removed and a utf-8 one is put right at the
 * head element. Sets $mCharset, appends to $mInfo.
 *
 * :TODO: html5 style <meta charset="..."> is not recognized, it falls
 * back to detection.
 *
 * @see $mHtml
 */
protected function MbConvert()
{
    // Find charset the webpage uses currently.
    // The '-' is last in the character class, PCRE2 (PHP 7.3+) refuses
    // to compile a class like [\w\d-_] as an invalid range.
    $ar = $this->Match('/(<meta[^>]+content=[^>]+charset=([\w-]+)[\"\'][^>]*>)/i');
    $charset = '';
    if (is_array($ar)) {
        // For multi charset declaration, use the first one
        if ((isset($ar[0])) && (is_array($ar[0])))
            $ar = $ar[0];
        if (1 < count($ar))
            $charset = $ar[1];
    }
    $charset = strtolower($charset);

    // Check charset got is valid, if not, detect it later.
    // Discuz! v4.0.0 printed page has this broken declaration:
    //   <meta http-equiv="Content-Type" content="text/html; charset=CHARSET">
    if ('charset' == $charset)
        $charset = '';
    // :THINK: Use mb_check_encoding check again?

    // Meta Content-type, put close to head so no non-ascii will occur
    // before it. The pattern must not match <header ...>, and only the
    // first head tag is replaced.
    $meta = '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
    $regex_head = '/<head(\s[^>]*)?>/i';

    if (!empty($charset)) {
        // Remove old declaration
        $this->mHtml = str_replace($ar[0], '', $this->mHtml);
        $this->mHtml = preg_replace($regex_head, $meta, $this->mHtml, 1);
        if ('utf-8' != $charset) {
            $this->mHtml = mb_convert_encoding($this->mHtml, 'utf-8', $charset);
        }
        $this->mInfo .= "Original charset: $charset<br />\n";
    } else {
        // Doc has no charset meta, detect it and force the meta added
        $detected = mb_detect_encoding($this->mHtml
            , "gb2312, gbk, big5, utf-8");
        // Detection may fail, content is left as it is then instead of
        // converting from an empty charset.
        $charset = (false === $detected) ? '' : strtolower($detected);
        if (('' != $charset) && ('utf-8' != $charset)) {
            $this->mHtml = mb_convert_encoding($this->mHtml, 'utf-8', $charset);
            $this->mInfo .= "Original charset: $charset<br />\n";
        }
        $this->mHtml = preg_replace($regex_head, $meta, $this->mHtml, 1);
    }

    $this->mCharset = $charset;
    if ($this->mCliMode)
        echo "[Curl ] Original charset: $charset.\n";
} // end of func MbConvert
||
| 500 | |||
| 501 | |||
/**
 * Output result html as a download.
 *
 * The file name is the original url without its http(s):// prefix, with
 * each of / ? & ; = : turned into '_', and '.html' appended.
 */
public function OutputDownload()
{
    // Drop the plan part of url
    $name = preg_replace('/^(http|https):\/\/(.*)/i', '\2', $this->mUrl);
    // Chars which can not be used in a file name, one '_' for each
    $name = strtr($name, '/?&;=:', '______');

    Download($this->mHtml, $name . '.html');
} // end of func OutputDownload
||
| 513 | |||
| 514 | |||
/**
 * Begin get webpage & parse it.
 *
 * Does nothing when no url is set. Otherwise retrieves the page (or uses
 * the html set manually when $mRetrieveHtml is false), converts it to
 * utf-8, changes images, style sheets, scripts and embedded style to
 * data:URI and appends the info block. The result is left in $mHtml,
 * error messages are appended to $mMsg.
 */
public function Parse()
{
    if (empty($this->mUrl))
        return;

    if ($this->mCliMode)
        echo "[Curl ] Get html content from $this->mUrl\n";
    $this->SetoptReferer($this->mUrl);
    if (true == $this->mRetrieveHtml)
        $this->mHtml = $this->Get($this->mUrl);
    else {
        // Do a dummy Get action, mRs is used in Match() (and/or etc...)
        $this->Get($this->mUrl);
        $this->mRs = $this->mHtml;
    }

    if (0 == strlen((string) $this->mHtml)) {
        // Some error happened
        $this->mMsg .= curl_error($this->mSh);
        if ($this->mCliMode)
            echo "[Curl ] Failed.\n";
        return;
    }

    if ($this->mCliMode)
        echo "[Curl ] Ok, "
            . number_format(strlen($this->mRs))
            . " bytes.\n";
    // Must stay close to the initial get, see GetBaseUrl()
    $this->GetBaseUrl();
    $this->MbConvert();

    // Do some cleanup with html code
    $this->PreParse();

    $dom = new DOMDocument();
    // Keep original format when output
    $dom->preserveWhiteSpace = true;

    // Turn every non-ascii char to a numeric entity, so DOM need not
    // know the charset and un-readable chars do no harm. This replaces
    // converting to the 'HTML-ENTITIES' encoding, which is deprecated
    // since PHP 8.2.
    $this->mHtml = mb_encode_numericentity($this->mHtml
        , array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');

    // Un-wellformed html raises a lot of warnings, collect them inside
    // libxml instead of hiding them with @, and check the result.
    $libxml_prev = libxml_use_internal_errors(true);
    $loaded = $dom->loadHTML($this->mHtml);
    libxml_clear_errors();
    libxml_use_internal_errors($libxml_prev);
    if (false === $loaded) {
        $this->mMsg .= "Failed to parse html content.<br />\n";
        if ($this->mCliMode)
            echo "[Curl ] Failed.\n";
        return;
    }
    // :TODO: If parse all relative link href, can I make a proxy ?

    // Embedded style first, this is the 'slow' step
    $this->DomChangeStyle($dom);

    $this->DomChange($dom, 'img', 'src');
    $this->DomChange($dom, 'link', 'href', array('rel'=>'stylesheet'));
    // No type condition for js, every script with a src is changed
    $this->DomChange($dom, 'script', 'src');

    $this->AddInfo($dom);
    $this->mHtml = $dom->saveHTML();
} // end of func Parse
||
| 584 | |||
| 585 | |||
/**
 * Resolve a url found in the page & get it as data:URI.
 *
 * Handles absolute http(s) urls, urls without plan (//host/path),
 * absolute paths (/path) and paths relative to the base url.
 * Returns an empty string, which makes callers keep the original value,
 * when $url is empty, is a fragment only (#id), uses any other plan
 * (data:, javascript:, mailto: ...) or can not be retrieved.
 *
 * @param string $url
 * @return string  data:URI, or empty string
 */
protected function ParseUrl($url)
{
    // Urls captured from css may carry white space around them
    $url = trim((string) $url);
    if (empty($url))
        return '';

    // Reference inside the document, eg: url(#clip), nothing to retrieve
    if ('#' == $url[0])
        return '';

    $src = strtolower($url);

    // Uri start from http
    if (('http://' == substr($src, 0, 7)) || ('https://' == substr($src, 0, 8)))
        return $this->ParseUrl2Data($url);

    // Without plan, use plan of the page. For IBM developerworks
    if ('//' == substr($src, 0, 2))
        return $this->ParseUrl2Data($this->sUrlPlan . ':' . $url);

    // Any other plan can not be retrieved by http, and data: is done
    // already. Without this check they were treated as a relative path.
    if (1 === preg_match('/^[a-z][a-z0-9+.\-]*:/', $src))
        return '';

    // Link baseurl with file needed to parse
    if ('/' == $url[0]) {
        // Absolute path, compute start from host name
        $baseurl = preg_replace('/(http|https)(:\/\/[^\/]+)\/.*/i', '\1\2', $this->mUrlBase);
        $objurl = $baseurl . $url;
    } else {
        // Relative path
        $objurl = $this->mUrlBase . $url;
    }

    // Got result url, parse & return
    return $this->ParseUrl2Data($objurl);
} // end of func ParseUrl
||
| 621 | |||
| 622 | |||
/**
 * Retrieve a http object & return it as data:URI.
 *
 * Successful results are kept in $mCache and the url is added to $mGetOk;
 * a url which gives no content is added to $mGetFailed and is not cached.
 * In cli mode one progress line is echoed per retrieval.
 *
 * @param string $url
 * @return string  data:URI, empty string when retrieve failed
 */
protected function ParseUrl2Data($url)
{
    // Retrieved before, no need to get again
    if (isset($this->mCache[$url])) {
        return $this->mCache[$url];
    }

    $content = $this->Get($url);
    if (0 < strlen($this->mRs)) {
        // Code is not used below, the call is kept as it was
        $code = $this->GetLastCode();
        $type = $this->GetLastContentType();

        $result = 'data:' . $type . ';base64,' . base64_encode($content);
        $this->mCache[$url] = $result;
        $this->mGetOk[] = $url;
        $mark = '√';
    } else {
        // Fail
        $result = '';
        $this->mGetFailed[] = $url;
        $mark = '×';
    }

    if ($this->mCliMode) {
        // Sequence number of this retrieval, last 3 digits, zero padded
        $total = count($this->mGetOk) + count($this->mGetFailed);
        echo "[" . substr('000' . strval($total), -3) . " ] " . $mark . ": $url\n";
    }

    return $result;
} // end of func ParseUrl2Data
||
| 658 | |||
| 659 | |||
/**
 * Cleanup html code before parse.
 *
 * Removes extra xml markup which DOM can not treat well:
 *  - the xml declaration (version="1.0" encoding="utf-8" ...)
 *  - the xmlns attribute (and what follows it) of the html tag, eg:
 *    <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 */
protected function PreParse() {
    // pattern => replacement, applied in this order
    $ar_cleanup = array(
        '/<\?xml version=[^>]+>/i' => '',
        '/<html\s+xmlns=[^>]+>/i' => '<html>',
    );

    foreach ($ar_cleanup as $regex => $replace) {
        $this->mHtml = preg_replace($regex, $replace, $this->mHtml);
    }
} // end of func PreParse
||
| 672 | |||
| 673 | |||
/**
 * Set url of web page to process.
 *
 * The url is stored in decoded form, and only when IsSafe() accepts it;
 * otherwise $mUrl is left unchanged.
 *
 * @param string $url
 */
public function SetUrl ($url) {
    if (empty($url))
        return;

    // Convert encoded url(eg: chinese) back to original.
    // This is done before the safe check, so the url checked is exactly
    // the url stored: a percent encoded host name can not pass the check
    // and then decode to a refused host.
    $url = urldecode($url);
    if ($this->IsSafe($url))
        $this->mUrl = $url;
} // end of func SetUrl
||
| 685 | |||
| 686 | |||
| 687 | } // end of class ToDataUri |
||
| 688 | ?> |
||
| 689 |
If you suppress an error, we recommend checking for the error condition explicitly: