1
|
|
|
<?php |
2
|
|
|
require_once(dirname(__FILE__) . '/fwolflib.php'); |
3
|
|
|
require_once(FWOLFLIB . 'class/curl.php'); |
4
|
|
|
require_once(FWOLFLIB . 'func/download.php'); |
5
|
|
|
require_once(FWOLFLIB . 'func/env.php'); |
6
|
|
|
require_once(FWOLFLIB . 'func/request.php'); |
7
|
|
|
require_once(FWOLFLIB . 'func/url.php'); |
8
|
|
|
|
9
|
|
|
|
10
|
|
|
/** |
11
|
|
|
* Convert css, js, image in a html file, to save it in ONE file like mht. |
12
|
|
|
* |
13
|
|
|
* @package fwolflib |
14
|
|
|
* @copyright Copyright 2007-2012, Fwolf |
15
|
|
|
* @author Fwolf <[email protected]> |
16
|
|
|
* @since 2007-04-06 |
17
|
|
|
*/ |
18
|
|
|
class ToDataUri extends Curl { |
|
|
|
|
19
|
|
|
/** |
20
|
|
|
* Cache of src already retrieved |
21
|
|
|
* Format: url=>base64_data |
22
|
|
|
* @var array |
23
|
|
|
*/ |
24
|
|
|
protected $mCache = array(); |
25
|
|
|
|
26
|
|
|
/** |
27
|
|
|
* Charset of original web page |
28
|
|
|
* Show in info block. |
29
|
|
|
* @var string |
30
|
|
|
*/ |
31
|
|
|
protected $mCharset = ''; |
32
|
|
|
|
33
|
|
|
/** |
34
|
|
|
* Running in cli mode |
35
|
|
|
* Will echo some message directly |
36
|
|
|
* @var boolean |
37
|
|
|
*/ |
38
|
|
|
protected $mCliMode = false; |
39
|
|
|
|
40
|
|
|
/** |
41
|
|
|
* URIs which could not be retrieved |
42
|
|
|
* Only for debug or output purpose |
43
|
|
|
* @var array |
44
|
|
|
*/ |
45
|
|
|
protected $mGetFailed = array(); |
46
|
|
|
|
47
|
|
|
/** |
48
|
|
|
* URIs which were retrieved successfully |
49
|
|
|
* @var array |
50
|
|
|
*/ |
51
|
|
|
protected $mGetOk = array(); |
52
|
|
|
|
53
|
|
|
/** |
54
|
|
|
* Html code get from target |
55
|
|
|
* Change is also done here, so this can be output directly |
56
|
|
|
* @var string |
57
|
|
|
*/ |
58
|
|
|
public $mHtml = ''; |
59
|
|
|
|
60
|
|
|
/** |
61
|
|
|
* Information of Process, display in footer. (obsolete?) |
62
|
|
|
* @var string |
63
|
|
|
* @see $mMsg |
64
|
|
|
*/ |
65
|
|
|
public $mInfo = ''; |
66
|
|
|
|
67
|
|
|
/** |
68
|
|
|
* Simple response message |
69
|
|
|
* Display below form |
70
|
|
|
* @var string |
71
|
|
|
* @see $mInfo |
72
|
|
|
*/ |
73
|
|
|
public $mMsg = ''; |
74
|
|
|
|
75
|
|
|
/** |
76
|
|
|
* Retrieve html data |
77
|
|
|
* Auto retrieve html data by url on default, if set to false, $this->mHtml must be set manually. |
78
|
|
|
* @var boolean |
79
|
|
|
* @see $mHtml |
80
|
|
|
*/ |
81
|
|
|
public $mRetrieveHtml = true; |
82
|
|
|
|
83
|
|
|
/** |
84
|
|
|
* Original url |
85
|
|
|
* The web page, which contains css, js, image |
86
|
|
|
* @var string |
87
|
|
|
*/ |
88
|
|
|
public $mUrl = ''; |
89
|
|
|
|
90
|
|
|
/** |
91
|
|
|
* Baseurl of target webpage |
92
|
|
|
* eg: http://tld.com/dir/index.html, baseurl is http://tld.com/dir/ |
93
|
|
|
* @var string |
94
|
|
|
*/ |
95
|
|
|
protected $mUrlBase = ''; |
96
|
|
|
|
97
|
|
|
/** |
98
|
|
|
* http or https, for Baseurl |
99
|
|
|
* @var string |
100
|
|
|
*/ |
101
|
|
|
protected $sUrlPlan = ''; |
102
|
|
|
|
103
|
|
|
|
104
|
|
|
/**
 * Constructor
 *
 * Runs the parent constructor, stores the target url through SetUrl(),
 * passes false to SetoptSslverify() (presumably turning off ssl
 * certificate verification - confirm in class Curl), and remembers
 * whether the script is running in cli mode.
 *
 * @param string $url Url of web page to process, may also be set later with SetUrl()
 */
public function __construct($url = '')
{
    parent::__construct();

    $this->SetUrl($url);
    $this->SetoptSslverify(false);

    // In cli mode progress messages are echoed directly
    $this->mCliMode = IsCli() ? true : $this->mCliMode;
} // end of func __construct
118
|
|
|
|
119
|
|
|
|
120
|
|
|
/**
 * Add process information to dom, display at bottom of page
 *
 * Builds a <ul> holding original url, original charset, generator,
 * generate time and a click-to-expand list of retrieved/failed
 * resources, then appends it (in a <div>, after a <hr>) to <body>.
 * A <body> element is created when the document has none.
 *
 * @param DOMDocument $dom
 */
protected function AddInfo (&$dom) {
    // Using dom now, $this->mInfo is string, so...it's obsolete?

    $dom_info_ul = $dom->createElement('ul');
    $dom_info_ul->setAttribute('style', 'text-align: left');

    // Original url
    $a = $dom->createElement('a', htmlspecialchars($this->mUrl));
    $a->setAttribute('href', $this->mUrl);
    $li = $dom->createElement('li', "Original url: ");
    $li->appendChild($a);
    $dom_info_ul->appendChild($li);

    // Original charset
    $li = $dom->createElement('li', htmlspecialchars("Original charset: {$this->mCharset}"));
    $dom_info_ul->appendChild($li);

    // Url of this script
    if ($this->mCliMode) {
        $li = $dom->createElement('li', "Generate using Fwolf's 'Save html all in one file' tools(cli mode php script).");
    } else {
        $a = $dom->createElement('a', "Fwolf's 'Save html all in one file' tools");
        $a->setAttribute('href', GetSelfUrl(false));
        $li = $dom->createElement('li', "Generate using: ");
        $li->appendChild($a);
    }
    $dom_info_ul->appendChild($li);

    // Generate time
    $li = $dom->createElement('li', htmlspecialchars("Generate time: " . date('Y-m-d G:i:s')));
    $dom_info_ul->appendChild($li);

    // Resources summary, with a toggle for the detail list
    $i_getok = count($this->mGetOk);
    $i_getfailed = count($this->mGetFailed);
    $li = $dom->createElement('li', "Resources(" . ($i_getok + $i_getfailed) . " : √ $i_getok, × $i_getfailed): ");
    $dom_info_ul->appendChild($li);

    $span = $dom->createElement('span', "+++");
    $span->setAttribute('style', 'cursor: pointer;');
    $span->setAttribute('onclick', "javascript:obj=getElementById('fwolf_todatauri_info_resources_list');if ('none'==obj.style.display || ''==obj.style.display) {obj.style.display='block'; this.textContent='---';} else {obj.style.display='none';this.textContent='+++';}");
    $li->appendChild($span);

    // Append resources detail list as sub-ol, hidden until toggled
    $dom_resources_ol = $dom->createElement('ol');
    $dom_resources_ol->setAttribute('id', 'fwolf_todatauri_info_resources_list');
    $dom_resources_ol->setAttribute('style', 'display: none;');
    $this->AddInfoResourceList($dom, $dom_resources_ol, $this->mGetOk, '√: ');
    $this->AddInfoResourceList($dom, $dom_resources_ol, $this->mGetFailed, '×: ');
    $dom_info_ul->appendChild($dom_resources_ol);

    if ($this->mCliMode)
        echo "[Done ] Resources: √: " . count($this->mGetOk) . ", ×: " . count($this->mGetFailed) . ".\n";

    // If html contents like this, it have not <body>, so we must create it
    // <html>
    // <meta http-equiv="refresh" content="0;url=http://www.baidu.com/">
    // </html>
    $body = $dom->getElementsByTagName('body')->item(0);
    if (null === $body) {
        $body = $dom->createElement('body');
        // Attach to <html>; fall back to whatever root exists, or to
        // the document itself, instead of calling a method on null.
        $parent = $dom->getElementsByTagName('html')->item(0);
        if (null === $parent)
            $parent = $dom->documentElement;
        if (null === $parent)
            $parent = $dom;
        $parent->appendChild($body);
    }

    $div = $dom->createElement('div');
    $div->setAttribute('id', 'fwolf_save_file_all_in_one_info');
    $div->setAttribute('style', 'clear: both;');
    $hr = $dom->createElement('hr');
    $hr->setAttribute('style', 'border: 0px; height: 1px; color: #B0C4DE; background-color: #B0C4DE;');
    $div->appendChild($hr);
    $div->appendChild($dom_info_ul);
    $body->appendChild($div);
} // end of func AddInfo


/**
 * Append one <li> per url to a list element, each with a mark and a link
 *
 * @param DOMDocument $dom
 * @param DOMElement $list Element the new <li> are appended to
 * @param array $urls Raw (not html-escaped) urls
 * @param string $mark Text put in front of the link, eg: '√: '
 */
protected function AddInfoResourceList (&$dom, $list, $urls, $mark) {
    foreach ($urls as $url) {
        // Element text given to createElement() must be escaped by
        // caller, attribute value must NOT: escaping it here too made
        // every '&' in href come out double-escaped.
        $a = $dom->createElement('a', htmlspecialchars($url));
        $a->setAttribute('href', $url);
        $li = $dom->createElement('li', $mark);
        $li->appendChild($a);
        $list->appendChild($li);
    }
} // end of func AddInfoResourceList
224
|
|
|
|
225
|
|
|
|
226
|
|
|
/**
 * Change one attribute of all matching elements to data:URI style
 *
 * Walks every <$tag> element of $dom. Elements passing the $cond check
 * get their $attr value handed to ParseUrl(), and the attribute is
 * overwritten with the result. When ParseUrl() gives an empty result
 * the original attribute value is kept.
 *
 * @param DOMDocument $dom DOMDocument object
 * @param string $tag Tag name to look for, eg: img
 * @param string $attr Attribute holding the url, eg: src
 * @param array $cond Condition, eg: type=>'text/css' for link css
 */
protected function DomChange(&$dom, $tag, $attr, $cond=array())
{
    $is_script = ('script' == $tag);

    foreach ($dom->getElementsByTagName($tag) as $node) {
        // In-document js have text/javascript also, but src is empty
        if ($is_script && ('' == $node->getAttribute('src'))) {
            continue;
        }

        // Every attribute listed in condition must have requested value
        $matched = true;
        if (!empty($cond)) {
            foreach ($cond as $name => $expected) {
                if ($expected != $node->getAttribute($name)) {
                    $matched = false;
                    break;
                }
            }
        }
        if (!$matched) {
            continue;
        }

        // If parse failed, keep original value
        $data = $this->ParseUrl($node->getAttribute($attr));
        if (!empty($data)) {
            $node->setAttribute($attr, $data);
        }
    }
} // end of func DomChange
264
|
|
|
|
265
|
|
|
|
266
|
|
|
/**
 * Change embedded style url in dom
 *
 * Converts urls found in content of <style> elements (both @import "x"
 * and url(x) form) and in style="" attribute of a fixed list of tags
 * (url(x) form only) to data:URI.
 *
 * Linked style is already parsed by:
 * $this->DomChange($dom, 'link', 'href', array('rel'=>'stylesheet'));
 *
 * @param DOMDocument $dom DOMDocument object
 */
protected function DomChangeStyle(&$dom)
{
    // Example1, with @import, no url(
    // @import "mystyle.css";
    // @import "../hide2.css";
    $regex_import = "/(@import\s*\(?['\"]([^'\"\(\)\{\}]+)['\"]\s*\)?)/i";
    // Example2, with url(, regardless of @import
    // url("../hide1a.css");
    // url(../hide1b.css);
    $regex_url = "/(url\s*\(['\"]?\s*([^'\"\(\)\{\}]+)['\"]?\s*\))/i";

    // <style> elements
    $items = $dom->getElementsByTagName('style');
    for ($i = 0; $i < $items->length; $i++) {
        $item = $items->item($i);

        $src = $item->nodeValue;
        if (empty($src)) continue;

        $css = $this->DomChangeStyleUrl($src, $regex_import);
        $css = $this->DomChangeStyleUrl($css, $regex_url);
        // Write result to dom
        if ($css !== $src)
            $item->nodeValue = $css;
    }

    // Inline style attribute
    // :QUESTION: Is these tags slow down treatment?
    $ar_tags = array('a', 'blockquote', 'body', 'button', 'code', 'dd', 'del', 'div', 'dl', 'dt', 'form', 'hr', 'img', 'input', 'li', 'ol', 'option', 'p', 'pre', 'q', 'select', 'small', 'span', 'strong', 'table', 'td', 'textarea', 'th', 'tr', 'ul');
    foreach ($ar_tags as $tag) {
        $items = $dom->getElementsByTagName($tag);
        $i_items = $items->length;
        for ($i = 0; $i < $i_items; $i++) {
            $item = $items->item($i);

            $src = $item->getAttribute('style');
            if (empty($src)) continue;

            // Same pattern as for <style>: the former pattern here did
            // not exclude ')' from url part, so with 2 unquoted url()
            // in one attribute everything between them became 'url'.
            $css = $this->DomChangeStyleUrl($src, $regex_url);
            if ($css !== $src)
                $item->setAttribute('style', $css);
        }
    }
} // end of func DomChangeStyle


/**
 * Replace urls found by a pattern in css source with data:URI
 *
 * Urls which ParseUrl() can not convert are left as they are.
 *
 * @param string $css Css source
 * @param string $regex Pattern with 2 groups: whole expression, and url inside it
 * @return string
 */
protected function DomChangeStyleUrl($css, $regex)
{
    $ar = $this->Match($regex, $css);
    if (empty($ar))
        return $css;

    // Do as multi match (single match seems to come back as flat array)
    if (!is_array($ar[0]))
        $ar = array(0 => $ar);

    foreach ($ar as $val) {
        // Url part may carry blank from inside of 'url( x )'
        $s = $this->ParseUrl(trim($val[1]));
        if (!empty($s)) {
            // Use whole match to do str_replace, because url can be used multi times.
            $s = str_replace($val[1], $s, $val[0]);
            $css = str_replace($val[0], $s, $css);
        }
    }
    return $css;
} // end of func DomChangeStyleUrl
379
|
|
|
|
380
|
|
|
|
381
|
|
|
/**
 * Get baseurl from init get
 *
 * Baseurl is used when retrieving css, js, images with relative path.
 * It is not equal to hostname, it may include some dir.
 * Must be executed right after the initial curl_exec, because it reads
 * the effective url from the curl handle, which changes with every
 * later get action.
 *
 * Result is stored in $this->mUrlBase (always ends with '/'), its
 * scheme in $this->sUrlPlan.
 */
protected function GetBaseUrl()
{
    // Input URL is a dir or a file -> Use the url webserver uses
    // But still will got wrong when url like this:
    // $url = 'http://131.2.101.10/sys/phpinfo.php/aa';
    // :TODO: check what link will browser generate in upper situation

    // curl_getinfo can recognize dir/file of an address
    // so here cannot use $this->mUrl + preg_replace to compute baseurl
    $baseurl = (string) curl_getinfo($this->mSh, CURLINFO_EFFECTIVE_URL);
    // Got the path part of url, should end with '/', exclude this:
    // http://131.2.101.10
    $baseurl = preg_replace('/(http|https)(:\/\/.+)\/[^\/]*$/i', '\1\2', $baseurl);
    // Add the missing tailing '/' in some special condition.
    // substr() instead of {} offset: that syntax is gone in PHP 8 and
    // raised a notice on empty string.
    if ('/' != substr($baseurl, -1))
        $baseurl .= '/';
    $this->mUrlBase = $baseurl;

    // Url plan
    $this->sUrlPlan = UrlPlan($this->mUrlBase);

    $this->mInfo .= "Baseurl: $baseurl<br />\n";
    if ($this->mCliMode)
        echo "[Curl ] Baseurl: $baseurl\n";
} // end of func GetBaseUrl
414
|
|
|
|
415
|
|
|
|
416
|
|
|
/**
 * Check if user input url is safe to retrieve
 *
 * Url must be at least 13 chars long, start with http:// or https://,
 * and its host must not start with localhost, 127.0.0.1, 2130706433
 * (127.0.0.1 as decimal) or 192.168.0.
 * When check fails, a message is appended to $this->mMsg.
 *
 * NOTE: this is only a string prefix check on the host, a public host
 * name which resolves to an internal address is not detected.
 *
 * @param string $url
 * @return boolean
 */
protected function IsSafe($url)
{
    $safe = true;
    if (13 > strlen($url)) $safe = false;

    $url_http = strtolower(substr($url, 0, 8));
    if (('http://' != substr($url_http, 0, 7)) && ('https://' != $url_http))
        $safe = false;

    $hostname = preg_replace('/^(http|https):\/\/([^\/]+)\/?.*/i', '\2', $url);
    // Authority part ends where query or fragment begins
    $hostname = (string) substr($hostname, 0, strcspn($hostname, '?#'));
    // Skip 'user:pass@', or it hides the real host from check below
    $i_at = strrpos($hostname, '@');
    if (false !== $i_at)
        $hostname = (string) substr($hostname, $i_at + 1);
    // Host name is case insensitive
    $hostname = strtolower($hostname);

    // Compare with full length of each prefix: the decimal ip has 10
    // chars, comparing it with 9 chars of host could never match.
    $ar_blocked = array('localhost', '127.0.0.1', '2130706433', '192.168.0.');
    foreach ($ar_blocked as $prefix) {
        if (0 === strncmp($hostname, $prefix, strlen($prefix)))
            $safe = false;
    }
    // :TODO: Can't do with my self

    if (false == $safe)
        $this->mMsg .= "目标网址不安全,不要折腾我的服务器啦~拜托(" . ip2long($hostname) . ")<br />\n";
    return $safe;
} // end of func IsSafe
439
|
|
|
|
440
|
|
|
|
441
|
|
|
/**
 * Convert content html to utf8
 *
 * Charset is read from a meta tag like:
 * <meta http-equiv="Content-Type" content="text/html;charset=gb2312">
 * or, if there is none (or it is not usable), guessed by
 * mb_detect_encoding(). The old charset meta is removed, and an utf-8
 * one is put right after <head>. Charset found is stored in
 * $this->mCharset.
 *
 * @see $mHtml
 */
protected function MbConvert()
{
    // Find charset webpage use current
    //<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    // '-' is last in char class: PCRE2 (PHP 7.3+) refuses to compile
    // a '-' between \d and another char, \w already covers \d and _
    $ar = $this->Match('/(<meta[^>]+content=[^>]+charset=([\w-]+)[\"\'][^>]*>)/i');
    $charset = '';
    // For multi charset declaration
    if ((isset($ar[0])) && (is_array($ar[0])))
        $ar = $ar[0];
    if (is_array($ar) && (1 < count($ar))) {
        $charset = $ar[1];
    }
    $charset = strtolower($charset);
    // Check charset got is valid, if no, detect it
    // Discuz! error, I have no other ways to detect current encoding
    // v4.0.0, printed page:
    //<meta http-equiv="Content-Type" content="text/html; charset=CHARSET">
    if ('charset' == $charset) {
        // Treat later
        $charset = '';
    }
    // :THINK: Use mb_check_encoding check again?

    // Meta Content-type
    $meta = '<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
    // Must not match <header ...>, and there is only one <head>
    $regex_head = '/<head(\s[^>]*)?>/i';

    if (!empty($charset)) {
        // Remove old markup <!-- charset declare deleted -->
        $this->mHtml = str_replace($ar[0], '', $this->mHtml);
        // Put meta close to head, so no non-ascii will occur before it
        $this->mHtml = preg_replace($regex_head, $meta, $this->mHtml, 1);
        if ('utf-8' != $charset) {
            $this->mHtml = mb_convert_encoding($this->mHtml, 'utf-8', $charset);
        }
        $this->mInfo .= "Original charset: $charset<br />\n";
    } else {
        // Doc has no charset meta, force added
        $detected = mb_detect_encoding($this->mHtml
            , "gb2312, gbk, big5, utf-8");
        // Detect may fail, then leave content as it is: converting
        // from an empty encoding name can only break it
        $charset = (false === $detected) ? '' : strtolower($detected);
        if (('' != $charset) && ('utf-8' != $charset)) {
            $this->mHtml = mb_convert_encoding($this->mHtml, 'utf-8', $charset);
            $this->mInfo .= "Original charset: $charset<br />\n";
        }
        $this->mHtml = preg_replace($regex_head, $meta, $this->mHtml, 1);
    }

    $this->mCharset = $charset;
    if ($this->mCliMode)
        echo "[Curl ] Original charset: $charset.\n";
} // end of func MbConvert
500
|
|
|
|
501
|
|
|
|
502
|
|
|
/**
 * Output - using download
 *
 * Download file name is the url without its http(s):// prefix, with
 * every / ? & ; = : replaced by underscore, and '.html' appended.
 * Content of $this->mHtml and the file name are handed to Download().
 */
public function OutputDownload()
{
    // Name
    $name = preg_replace('/^(http|https):\/\/(.*)/i', '\2', $this->mUrl);
    $filename = strtr($name, '/?&;=:', '______') . '.html';

    Download($this->mHtml, $filename);
} // end of func OutputDownload
513
|
|
|
|
514
|
|
|
|
515
|
|
|
/**
 * Begin get webpage & parse it
 *
 * Does nothing when $this->mUrl is empty. Otherwise retrieves the page
 * (or, with $mRetrieveHtml false, uses $this->mHtml set by caller),
 * converts it to utf-8, inlines image, linked css, js and embedded
 * style urls as data:URI, appends info block, and stores the result
 * back in $this->mHtml. On retrieve failure the curl error is appended
 * to $this->mMsg.
 */
public function Parse()
{
    if (empty($this->mUrl))
        return;

    if ($this->mCliMode)
        echo "[Curl ] Get html content from $this->mUrl\n";
    $this->SetoptReferer($this->mUrl);
    if (true == $this->mRetrieveHtml)
        $this->mHtml = $this->Get($this->mUrl);
    else {
        // Do an dummy Get action, mRs is used in Match() (and/or etc...)
        $this->Get($this->mUrl);
        $this->mRs = $this->mHtml;
    }

    if (0 == strlen($this->mHtml)) {
        // Some error happen
        $this->mMsg .= curl_error($this->mSh);
        if ($this->mCliMode)
            echo "[Curl ] Failed.\n";
        return;
    }

    if ($this->mCliMode)
        echo "[Curl ] Ok, "
            . number_format(strlen($this->mRs))
            . " bytes.\n";
    // Must follow the initial get directly, it reads curl handle info
    $this->GetBaseUrl();
    $this->MbConvert();

    // Do some cleanup with html code
    $this->PreParse();

    $dom = new DOMDocument();
    // Keep original format when output
    $dom->preserveWhiteSpace = true;

    // :TODO: parse un-wellform html error ?
    // Turn every non-ascii char to numeric entity, so loadHTML() does
    // not need to guess encoding. Conversion to 'HTML-ENTITIES' did
    // this before, but that encoding is deprecated since PHP 8.2.
    $this->mHtml = mb_encode_numericentity($this->mHtml
        , array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');
    // Collect markup warnings of real world html inside libxml,
    // instead of hiding them with @
    $b_libxml = libxml_use_internal_errors(true);
    $dom->loadHTML($this->mHtml);
    libxml_clear_errors();
    libxml_use_internal_errors($b_libxml);
    // :TODO: If parse all relative link href, can I make a proxy ?

    // Embedded style first, then image, linked css and js
    $this->DomChangeStyle($dom);

    $this->DomChange($dom, 'img', 'src');
    $this->DomChange($dom, 'link', 'href', array('rel'=>'stylesheet'));
    // Js condition (type=>'text/javascript') not requested anymore
    $this->DomChange($dom, 'script', 'src');

    $this->AddInfo($dom);
    $this->mHtml = $dom->saveHTML();
} // end of func Parse
584
|
|
|
|
585
|
|
|
|
586
|
|
|
/**
 * Get a url & parse it
 *
 * Url is made absolute first:
 *  - http://.. or https://.. is used as is
 *  - //host/.. gets scheme from $this->sUrlPlan
 *  - /path is appended to scheme + host of $this->mUrlBase
 *  - anything else is appended to $this->mUrlBase
 * then handed to ParseUrl2Data().
 *
 * Return value is data:URI format, or empty string when url is empty,
 * is a data:URI already, or can not be retrieved. Caller should keep
 * original url then.
 *
 * @param string $url
 * @return string
 */
protected function ParseUrl($url)
{
    if (empty($url))
        return '';

    $src = strtolower($url);

    // Already inline, there is nothing to retrieve. Without this it
    // was appended to baseurl and requested from server.
    if ('data:' == substr($src, 0, 5))
        return '';

    // Uri start from http
    if (('http://' == substr($src, 0, 7)) || ('https://' == substr($src, 0, 8)))
        return $this->ParseUrl2Data($url);

    // Scheme relative, for IBM developerworks
    if ('//' == substr($src, 0, 2))
        return $this->ParseUrl2Data($this->sUrlPlan . ':' . $url);

    // Link baseurl with file needed to parse.
    // [] instead of {} offset: that syntax is gone in PHP 8.
    if ('/' == $url[0]) {
        // Absolute path, compute start from host name
        $baseurl = preg_replace('/(http|https)(:\/\/[^\/]+)\/.*/i', '\1\2', $this->mUrlBase);
        $objurl = $baseurl . $url;
    } else {
        // Relative path
        $objurl = $this->mUrlBase . $url;
    }

    // Got result url, parse & return
    return $this->ParseUrl2Data($objurl);
} // end of func ParseUrl
621
|
|
|
|
622
|
|
|
|
623
|
|
|
/**
 * Retrieve a http object & return data:URI
 *
 * Result of a successful retrieve is kept in $this->mCache, so each
 * url is requested only once. Url is also recorded in $this->mGetOk
 * or $this->mGetFailed, and printed in cli mode.
 *
 * Return empty string when retrieve failed.
 *
 * @param string $url
 * @return string
 */
protected function ParseUrl2Data($url)
{
    // Retrieved before: no request, no record, no message
    if (isset($this->mCache[$url])) {
        return $this->mCache[$url];
    }

    $rs = $this->Get($url);
    $b_ok = (0 < strlen($this->mRs));

    if ($b_ok) {
        // Code is read but not used (yet?)
        $rs_code = $this->GetLastCode();
        $rs_type = $this->GetLastContentType();

        $data = 'data:' . $rs_type . ';base64,' . base64_encode($rs);
        $this->mCache[$url] = $data;
        $this->mGetOk[] = $url;
    } else {
        // Fail
        $data = '';
        $this->mGetFailed[] = $url;
    }

    if ($this->mCliMode) {
        // Sequence number of this resource, 3 digits
        $s_seq = substr('000' . strval(count($this->mGetOk) + count($this->mGetFailed)), -3);
        echo $b_ok
            ? "[" . $s_seq . " ] √: $url\n"
            : "[" . $s_seq . " ] ×: $url\n";
    }

    return $data;
} // end of func ParseUrl2Data
658
|
|
|
|
659
|
|
|
|
660
|
|
|
/**
 * Cleanup html code before parse
 *
 * Removes xml markup which DOM can not treat well from $this->mHtml:
 *  - the xml declaration (starting with '<' + '?xml version=')
 *  - attributes of a html tag starting with xmlns, like:
 *    <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
 *    which becomes a plain <html>
 */
protected function PreParse() {
    // Pattern => replacement, applied in this order
    $ar_cleanup = array(
        '/<\?xml version=[^>]+>/i' => '',
        '/<html\s+xmlns=[^>]+>/i' => '<html>',
    );

    $this->mHtml = preg_replace(
        array_keys($ar_cleanup),
        array_values($ar_cleanup),
        $this->mHtml
    );
} // end of func PreParse
672
|
|
|
|
673
|
|
|
|
674
|
|
|
/**
 * Set url of web page to process
 *
 * Url is urldecoded, then checked by IsSafe(). $this->mUrl is only
 * changed when url is not empty and passes the check.
 *
 * @param string $url
 */
public function SetUrl ($url) {
    if (empty($url))
        return;

    // Convert encoded url(eg: chinese) back to original.
    // Do it BEFORE safe check, so the check sees the url which is
    // really stored: an encoded host like %6cocalhost passed it before.
    $url = urldecode($url);

    if (!empty($url) && $this->IsSafe($url)) {
        $this->mUrl = $url;
    }
} // end of func SetUrl
685
|
|
|
|
686
|
|
|
|
687
|
|
|
} // end of class ToDataUri |
688
|
|
|
?> |
|
|
|
|
689
|
|
|
|
This class, trait or interface has been deprecated. The supplier of the file has supplied an explanatory message.
The explanatory message should give you some clue as to whether and when the type will be removed and what to use instead.