1 | <?php |
||||||
2 | |||||||
3 | namespace simplehtmldom; |
||||||
4 | |||||||
5 | /** |
||||||
6 | * Website: http://sourceforge.net/projects/simplehtmldom/ |
||||||
7 | * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/). |
||||||
8 | * |
||||||
9 | * Licensed under The MIT License |
||||||
10 | * See the LICENSE file in the project root for more information. |
||||||
11 | * |
||||||
12 | * Authors: |
||||||
13 | * S.C. Chen |
||||||
14 | * John Schlick |
||||||
15 | * Rus Carroll |
||||||
16 | * logmanoriginal |
||||||
17 | * |
||||||
18 | * Contributors: |
||||||
19 | * Yousuke Kumakura |
||||||
20 | * Vadim Voituk |
||||||
21 | * Antcs |
||||||
22 | * |
||||||
23 | * Version Rev. 2.0-RC2 (415) |
||||||
24 | */ |
||||||
25 | include_once __DIR__ . '/constants.php'; |
||||||
26 | include_once __DIR__ . '/HtmlNode.php'; |
||||||
27 | include_once __DIR__ . '/Debug.php'; |
||||||
28 | |||||||
29 | /** |
||||||
30 | * HTMLDocument class. |
||||||
31 | */ |
||||||
32 | class HtmlDocument |
||||||
33 | { |
||||||
/**
 * Root node of the document; created by prepare() with the tag name "root".
 *
 * @var HtmlNode
 */
public $root = null;

/**
 * Nodes belonging to this document. decode() and __destruct() iterate it.
 * NOTE(review): nothing in this part of the file appends to the list;
 * presumably HtmlNode or link_nodes() registers nodes here - confirm.
 *
 * @var HtmlNode[]
 */
public $nodes = [];

/**
 * Callback registered via set_callback() and cleared via remove_callback().
 *
 * @var callable|null
 */
public $callback = null;

/**
 * Whether tag and attribute names are converted to lower case while parsing.
 *
 * @var bool
 */
public $lowercase = false;

/**
 * Byte length of the trimmed input as recorded by prepare().
 *
 * @var int
 */
public $original_size;

/**
 * Byte length of the document currently being parsed.
 *
 * @var int
 */
public $size;

/**
 * Byte offset of the parser inside $doc.
 *
 * @var int
 */
protected $pos;

/**
 * Document source; only available while load() is running.
 *
 * @var string
 */
protected $doc;

/**
 * Character at offset $pos, or null once the end of the document is reached.
 *
 * @var string|null
 */
protected $char;

/**
 * Running node counter; starts at 1 in prepare() and is used for the
 * HDOM_INFO_BEGIN / HDOM_INFO_END markers of nodes.
 *
 * @var int
 */
protected $cursor;

/**
 * Node that currently acts as parent for newly parsed nodes.
 *
 * @var HtmlNode
 */
protected $parent;

/**
 * Storage used by remove_noise() / restore_noise(); reset by prepare().
 *
 * @var array
 */
protected $noise = [];

// Character sets handed to the skip/copy helpers of the tokenizer.
protected $token_blank = " \t\r\n";
protected $token_equal = ' =/>';
protected $token_slash = " />\r\n\t";
protected $token_attr = ' >';

/**
 * Charset of the source document as determined by parse_charset().
 *
 * @var string
 */
public $_charset = '';

/**
 * Charset requested by the caller; used by decode() when decoding entities.
 *
 * @var string
 */
public $_target_charset = '';

// Replacement texts stored by prepare().
// NOTE(review): presumably consumed by HtmlNode when rendering <br> and
// <span> elements as plain text - confirm against HtmlNode.
public $default_br_text = '';
public $default_span_text = '';

// Lookup table of tag names (name => 1).
// NOTE(review): judging by the name these are the elements without an end
// tag; the table is not read in this part of the file - confirm usage.
protected $self_closing_tags = [
    'area' => 1,
    'base' => 1,
    'br' => 1,
    'col' => 1,
    'embed' => 1,
    'hr' => 1,
    'img' => 1,
    'input' => 1,
    'link' => 1,
    'meta' => 1,
    'param' => 1,
    'source' => 1,
    'track' => 1,
    'wbr' => 1,
];

// Tags for which read_tag() walks up the ancestor chain to locate the
// matching start tag when an unexpected end tag is encountered.
protected $block_tags = [
    'body' => 1,
    'div' => 1,
    'form' => 1,
    'root' => 1,
    'span' => 1,
    'table' => 1,
];

// Map of "tag being opened" => "open parent tags it implicitly closes".
protected $optional_closing_tags = [
    // Not optional, see
    // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
    'b' => ['b' => 1],
    'dd' => ['dd' => 1, 'dt' => 1],
    // Not optional, see
    // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
    'dl' => ['dd' => 1, 'dt' => 1],
    'dt' => ['dd' => 1, 'dt' => 1],
    'li' => ['li' => 1],
    'optgroup' => ['optgroup' => 1, 'option' => 1],
    'option' => ['optgroup' => 1, 'option' => 1],
    'p' => ['p' => 1],
    'rp' => ['rp' => 1, 'rt' => 1],
    'rt' => ['rp' => 1, 'rt' => 1],
    'td' => ['td' => 1, 'th' => 1],
    'th' => ['td' => 1, 'th' => 1],
    'tr' => ['td' => 1, 'th' => 1, 'tr' => 1],
];
107 | |||||||
/**
 * Handles calls to methods that do not exist on this class.
 *
 * Supports the deprecated snake_case alias load_file() (forwarded to
 * loadFile()) and treats clear() as a no-op. Any other name raises an
 * E_USER_ERROR.
 *
 * @param string $func Name of the method that was called
 * @param array  $args Arguments passed to the method
 *
 * @return mixed Result of the forwarded call, or null if nothing was called
 */
public function __call($func, $args)
{
    // Allow users to call methods with lower_case syntax
    switch ($func) {
        case 'load_file':
            $actual_function = 'loadFile';
            break;
        case 'clear':
            return; /* no-op */
        default:
            trigger_error(
                'Call to undefined method ' . __CLASS__ . '::' . $func . '()',
                E_USER_ERROR
            );

            // A user-defined error handler may resume execution after the
            // error. There is nothing to forward to in that case, so stop
            // here instead of continuing with an undefined method name.
            return;
    }

    // phpcs:ignore Generic.Files.LineLength
    Debug::log(__CLASS__ . '->' . $func . '() has been deprecated and will be removed in the next major version of simplehtmldom. Use ' . __CLASS__ . '->' . $actual_function . '() instead.');

    return call_user_func_array([$this, $actual_function], $args);
}
||||||
129 | |||||||
/**
 * Creates a document and optionally loads content right away.
 *
 * A non-empty $str that starts with "http://" or names an existing file is
 * loaded through load_file(); any other non-empty value is parsed as HTML
 * via load(). Without content only the internal state is prepared.
 *
 * @param string|null $str             HTML source, URL or file path
 * @param bool        $lowercase       Passed on to load() / prepare()
 * @param bool        $forceTagsClosed When false, the table of optional
 *                                     closing tags is emptied
 * @param string      $target_charset  Stored in $_target_charset
 * @param bool        $stripRN         Passed on to load()
 * @param string      $defaultBRText   Passed on to load() / prepare()
 * @param string      $defaultSpanText Passed on to load() / prepare()
 * @param int         $options         Passed on to load()
 */
public function __construct(
    $str = null,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0
) {
    if ($str) {
        $is_url = 1 === preg_match('/^http:\/\//i', $str);

        // Only ask the file system about strings that can be a path at all.
        // HTML documents are routinely longer than the maximum path length,
        // which makes is_file() complain about the file name.
        $is_file = !$is_url
            && strlen($str) <= PHP_MAXPATHLEN
            && is_file($str);

        if ($is_url || $is_file) {
            $this->load_file($str);
        } else {
            $this->load(
                $str,
                $lowercase,
                $stripRN,
                $defaultBRText,
                $defaultSpanText,
                $options
            );
        }
    } else {
        $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
    }

    // Forcing tags to be closed implies that we don't trust the html, but
    // it can lead to parsing errors if we SHOULD trust the html.
    // NOTE(review): this runs after the initial load, so it only affects
    // documents loaded into this object later on - confirm whether it is
    // meant to apply to the initial parse as well.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = [];
    }

    $this->_target_charset = $target_charset;
}
||||||
164 | |||||||
/**
 * Limits the output of var_dump() to a handful of descriptive values.
 *
 * @return array Root node, noise, charsets and original size
 */
public function __debugInfo()
{
    $noise = $this->noise;

    if (empty($noise)) {
        $noise = 'none';
    }

    $info = [];
    $info['root'] = $this->root;
    $info['noise'] = $noise;
    $info['charset'] = $this->_charset;
    $info['target charset'] = $this->_target_charset;
    $info['original size'] = $this->original_size;

    return $info;
}
||||||
175 | |||||||
/**
 * Calls clear() on every node of the document when the document goes away.
 */
public function __destruct()
{
    if (!isset($this->nodes)) {
        return;
    }

    foreach ($this->nodes as $node) {
        $node->clear();
    }
}
||||||
184 | |||||||
/**
 * Parses an HTML string into this document.
 *
 * @param string $str             HTML source
 * @param bool   $lowercase       Passed on to prepare()
 * @param bool   $stripRN         Collapse line breaks and the whitespace
 *                                around them before parsing
 * @param string $defaultBRText   Passed on to prepare()
 * @param string $defaultSpanText Passed on to prepare()
 * @param int    $options         Bit mask; HDOM_SMARTY_AS_TEXT is evaluated
 *
 * @return $this
 */
public function load(
    $str,
    $lowercase = true,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0
) {
    $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

    if ($stripRN) {
        // Elements that must not lose whitespace are taken out temporarily
        $whitespace_sensitive = [
            "'<\s*script[^>]*>(.*?)<\s*/\s*script\s*>'is",
            "'<!\[CDATA\[(.*?)\]\]>'is",
            "'<!--(.*?)-->'is",
            "'<\s*style[^>]*>(.*?)<\s*/\s*style\s*>'is",
            "'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is",
        ];

        foreach ($whitespace_sensitive as $pattern) {
            $this->remove_noise($pattern);
        }

        // Line breaks between two tags vanish entirely ...
        $this->doc = preg_replace('/\>([\t\s]*[\r\n]^[\t\s]*)\</m', '><', $this->doc);

        // ... line breaks inside text become a single space
        $this->doc = preg_replace('/([\t\s]*[\r\n]^[\t\s]*)/m', ' ', $this->doc);

        // Put the elements back and account for the changed length
        $this->doc = $this->restore_noise($this->doc);
        $this->size = strlen($this->doc);
    }

    // Server-side script
    $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);

    if (0 !== count($this->noise)) {
        // phpcs:ignore Generic.Files.LineLength
        Debug::log('Support for server-side scripts has been deprecated and will be removed in the next major version of simplehtmldom.');
    }

    // Smarty scripts are only stripped on request
    if ($options & HDOM_SMARTY_AS_TEXT) {
        $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
        // phpcs:ignore Generic.Files.LineLength
        Debug::log('Support for Smarty scripts has been deprecated and will be removed in the next major version of simplehtmldom.');
    }

    $this->parse($stripRN);

    // Close the root node, then post-process the parsed nodes
    $this->root->_[HtmlNode::HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();
    $this->decode();

    // The source is not needed once parsing is done
    unset($this->doc);

    // Returning the document makes load() chainable
    return $this;
}
||||||
238 | |||||||
/**
 * Stores a callback in $callback.
 * NOTE(review): the code that invokes it is not in this part of the file.
 *
 * @param callable $function_name Callback to register
 */
public function set_callback($function_name)
{
    $this->callback = $function_name;
}
||||||
243 | |||||||
/**
 * Resets $callback to null, undoing set_callback().
 */
public function remove_callback()
{
    $this->callback = null;
}
||||||
248 | |||||||
/**
 * Returns the inner text of the root node and optionally writes it to disk.
 *
 * @param string $filepath Target file; nothing is written for an empty string
 *
 * @return string Result of innertext() on the root node
 */
public function save($filepath = '')
{
    $html = $this->root->innertext();

    if ('' === $filepath) {
        return $html;
    }

    // Written under an exclusive lock
    file_put_contents($filepath, $html, LOCK_EX);

    return $html;
}
||||||
263 | |||||||
/**
 * Find elements by CSS Selector, starting at the root node.
 *
 * @param string   $selector  CSS Selector
 * @param int|null $idx       Passed through to HtmlNode::find()
 * @param bool     $lowercase Passed through to HtmlNode::find()
 *
 * @return HtmlNode[]|HtmlNode
 */
public function find($selector, $idx = null, $lowercase = false)
{
    $root = $this->root;

    return $root->find($selector, $idx, $lowercase);
}
||||||
277 | |||||||
/**
 * Returns the inner text of the first <title> element.
 *
 * @return string|null Null if find() yields nothing for "title"
 */
public function title()
{
    $element = $this->find('title', 0);

    if (!$element) {
        return null;
    }

    return $element->innertext;
}
||||||
283 | |||||||
/**
 * Delegates to expect() of the root node with unchanged arguments.
 *
 * @param string   $selector  CSS Selector
 * @param int|null $idx       Passed through to the root node
 * @param bool     $lowercase Passed through to the root node
 *
 * @return mixed Whatever the root node returns
 */
public function expect($selector, $idx = null, $lowercase = false)
{
    $root = $this->root;

    return $root->expect($selector, $idx, $lowercase);
}
||||||
288 | |||||||
/**
 * Delegates to dump() of the root node.
 *
 * @param bool $show_attr Passed through to the root node
 *
 * @codeCoverageIgnore
 */
public function dump($show_attr = true)
{
    $root = $this->root;
    $root->dump($show_attr);
}
||||||
294 | |||||||
/**
 * Resets the parser state and installs a fresh root node for new input.
 *
 * @param string|null $str             Document source; null is treated as
 *                                     an empty document
 * @param bool        $lowercase       Stored in $lowercase
 * @param string      $defaultBRText   Stored in $default_br_text
 * @param string      $defaultSpanText Stored in $default_span_text
 */
protected function prepare(
    $str,
    $lowercase = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT
) {
    $this->clear();

    // The constructor passes null when no content was given; trim() must
    // not receive null (deprecated since PHP 8.1).
    $this->doc = trim(null === $str ? '' : $str);
    $this->size = strlen($this->doc);
    $this->original_size = $this->size; // original size of the html
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = [];
    $this->nodes = [];
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    $this->root = new HtmlNode($this);
    $this->root->tag = 'root';
    $this->root->_[HtmlNode::HDOM_INFO_BEGIN] = -1;
    $this->root->nodetype = HtmlNode::HDOM_TYPE_ROOT;
    $this->parent = $this->root;

    // Always reset the current character so that an empty document does not
    // inherit the value left behind by a previous run on the same object.
    $this->char = ($this->size > 0) ? $this->doc[0] : null;
}
||||||
322 | |||||||
/**
 * Decodes HTML entities in the text, inner text and attribute values of
 * all nodes. Text and inner text are passed through restore_noise() first.
 * Attributes whose value is true are left untouched.
 */
protected function decode()
{
    $flags = ENT_QUOTES | ENT_HTML5;
    $text_keys = [HtmlNode::HDOM_INFO_TEXT, HtmlNode::HDOM_INFO_INNER];

    foreach ($this->nodes as $node) {
        foreach ($text_keys as $key) {
            if (!isset($node->_[$key])) {
                continue;
            }

            $restored = $this->restore_noise($node->_[$key]);
            $node->_[$key] = html_entity_decode($restored, $flags, $this->_target_charset);
        }

        if (!isset($node->attr) || !is_array($node->attr)) {
            continue;
        }

        foreach ($node->attr as $name => $value) {
            if (true !== $value) {
                $node->attr[$name] = html_entity_decode($value, $flags, $this->_target_charset);
            }
        }
    }
}
||||||
354 | |||||||
/**
 * Main parser loop: alternates between collecting text up to the next "<"
 * and reading a tag until read_tag() reports the end of the document.
 *
 * @param bool $trim Drop text that consists of whitespace only; also
 *                   passed on to read_tag()
 */
protected function parse($trim = false)
{
    while (true) {
        if ('<' !== $this->char) {
            $text = $this->copy_until_char('<');

            if ('' !== $text) {
                // Whitespace between tags (</a> <b>) is skipped on request
                $whitespace_only = $trim && '' === trim($text);

                if ($whitespace_only) {
                    continue;
                }

                $text_node = new HtmlNode($this);
                ++$this->cursor;
                $text_node->_[HtmlNode::HDOM_INFO_TEXT] = $text;
                $this->link_nodes($text_node, false);
            }
        }

        $more = $this->read_tag($trim);

        if (false === $more) {
            break;
        }
    }
}
||||||
379 | |||||||
/**
 * Determines the charset of the source document and stores it in $_charset.
 *
 * Sources are tried in this order until one yields a non-empty value:
 *  1. the Content-Type header returned by the optional global function
 *     get_last_retrieve_url_contents_content_type(),
 *  2. the content attribute of <meta http-equiv="Content-Type">,
 *  3. the charset attribute of <meta charset>,
 *  4. mb_detect_encoding() on the document source (if mbstring is loaded),
 *  5. UTF-8 as a last resort.
 *
 * @return string Detected charset; ISO-8859-1 and Latin-1 are reported as
 *                CP1252
 */
protected function parse_charset()
{
    // Matches the charset parameter of a Content-Type value. Parameter
    // names are case-insensitive and the value may be quoted or followed by
    // further parameters, so only the bare token is captured.
    $charset_pattern = '/charset\s*=\s*["\']?([^\s;"\']+)/i';

    $charset = null;

    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $contentTypeHeader = call_user_func('get_last_retrieve_url_contents_content_type');
        $success = preg_match($charset_pattern, $contentTypeHeader, $matches);
        if ($success) {
            $charset = $matches[1];
        }

        // phpcs:ignore Generic.Files.LineLength
        Debug::log('Determining charset using get_last_retrieve_url_contents_content_type() ' . ($success ? 'successful' : 'failed'));
    }

    if (empty($charset)) {
        // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
        $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

        if (!empty($el)) {
            $fullvalue = $el->content;

            // An attribute without value is stored as true; only a string
            // can carry a charset.
            if (is_string($fullvalue) && !empty($fullvalue)) {
                $success = preg_match($charset_pattern, $fullvalue, $matches);

                if ($success) {
                    $charset = $matches[1];
                }
            }
        }
    }

    if (empty($charset)) {
        // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
        if ($meta = $this->root->find('meta[charset]', 0)) {
            // Ignore <meta charset> without a value (stored as true)
            if (is_string($meta->charset)) {
                $charset = trim($meta->charset);
            }
        }
    }

    if (empty($charset)) {
        // Try to guess the charset based on the content
        // Requires Multibyte String (mbstring) support (optional)
        if (function_exists('mb_detect_encoding')) {
            /**
             * mb_detect_encoding() is not intended to distinguish between
             * charsets, especially single-byte charsets. Its primary
             * purpose is to detect which multibyte encoding is in use,
             * i.e. UTF-8, UTF-16, shift-JIS, etc.
             *
             * -- https://bugs.php.net/bug.php?id=38138
             *
             * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
             * always result in CP1251/ISO-8859-5 and vice versa.
             *
             * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
             * to stay compatible.
             */
            $encoding = mb_detect_encoding(
                $this->doc,
                ['UTF-8', 'CP1252', 'ISO-8859-1']
            );

            if ('CP1252' === $encoding || 'ISO-8859-1' === $encoding) {
                // Due to a limitation of mb_detect_encoding
                // 'CP1251'/'ISO-8859-5' will be detected as
                // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
                // which case we can simply assume it is the other charset.
                if (!@iconv('CP1252', 'UTF-8', $this->doc)) {
                    $encoding = 'CP1251';
                }
            }

            if (false !== $encoding) {
                $charset = $encoding;
            }
        }
    }

    if (empty($charset)) {
        Debug::log('Unable to determine charset from source document. Assuming UTF-8');
        $charset = 'UTF-8';
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want
    // it instead.
    if (in_array(strtolower($charset), ['iso-8859-1', 'latin1', 'latin-1'], true)) {
        $charset = 'CP1252';
    }

    return $this->_charset = $charset;
}
||||||
478 | |||||||
479 | protected function read_tag($trim) |
||||||
480 | { |
||||||
481 | if ('<' !== $this->char) { // End Of File |
||||||
482 | $this->root->_[HtmlNode::HDOM_INFO_END] = $this->cursor; |
||||||
483 | |||||||
484 | // We might be in a nest of unclosed elements for which the end tags |
||||||
485 | // can be omitted. Close them for faster seek operations. |
||||||
486 | do { |
||||||
487 | if (isset($this->optional_closing_tags[strtolower($this->parent->tag)])) { |
||||||
488 | $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor; |
||||||
489 | } |
||||||
490 | } while ($this->parent = $this->parent->parent); |
||||||
491 | |||||||
492 | return false; |
||||||
493 | } |
||||||
494 | |||||||
495 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||||||
496 | |||||||
497 | if ($trim) { // "< /html>" |
||||||
498 | $this->skip($this->token_blank); |
||||||
499 | } |
||||||
500 | |||||||
501 | // End tag: https://dev.w3.org/html5/pf-summary/syntax.html#end-tags |
||||||
502 | if ('/' === $this->char) { |
||||||
503 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||||||
504 | |||||||
505 | $tag = $this->copy_until_char('>'); |
||||||
506 | $tag = $trim ? ltrim($tag, $this->token_blank) : $tag; |
||||||
507 | |||||||
508 | // Skip attributes and whitespace in end tags |
||||||
509 | if ($trim && false !== ($pos = strpos($tag, ' '))) { |
||||||
510 | // phpcs:ignore Generic.Files.LineLength |
||||||
511 | Debug::log_once('Source document contains superfluous whitespace in end tags (</html >).'); |
||||||
512 | $tag = substr($tag, 0, $pos); |
||||||
513 | } |
||||||
514 | |||||||
515 | if (strcasecmp($this->parent->tag, $tag)) { // Parent is not start tag |
||||||
516 | $parent_lower = strtolower($this->parent->tag); |
||||||
517 | $tag_lower = strtolower($tag); |
||||||
518 | if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) { |
||||||
519 | $org_parent = $this->parent; |
||||||
520 | |||||||
521 | // Look for the start tag |
||||||
522 | while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) { |
||||||
523 | // Close any unclosed element with optional end tags |
||||||
524 | if (isset($this->optional_closing_tags[strtolower($this->parent->tag)])) { |
||||||
525 | $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor; |
||||||
526 | } |
||||||
527 | $this->parent = $this->parent->parent; |
||||||
528 | } |
||||||
529 | |||||||
530 | // No start tag, close grandparent |
||||||
531 | if (strtolower($this->parent->tag) !== $tag_lower) { |
||||||
532 | $this->parent = $org_parent; |
||||||
533 | |||||||
534 | if ($this->parent->parent) { |
||||||
535 | $this->parent = $this->parent->parent; |
||||||
536 | } |
||||||
537 | |||||||
538 | $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor; |
||||||
539 | |||||||
540 | return $this->as_text_node($tag); |
||||||
541 | } |
||||||
542 | } elseif (($this->parent->parent) && isset($this->block_tags[$tag_lower])) { |
||||||
543 | // grandparent exists + current is block tag |
||||||
544 | // Parent has no end tag |
||||||
545 | $this->parent->_[HtmlNode::HDOM_INFO_END] = 0; |
||||||
546 | $org_parent = $this->parent; |
||||||
547 | |||||||
548 | // Find start tag |
||||||
549 | while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) { |
||||||
550 | $this->parent = $this->parent->parent; |
||||||
551 | } |
||||||
552 | |||||||
553 | // No start tag, close parent |
||||||
554 | if (strtolower($this->parent->tag) !== $tag_lower) { |
||||||
555 | $this->parent = $org_parent; // restore origonal parent |
||||||
556 | $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor; |
||||||
557 | |||||||
558 | return $this->as_text_node($tag); |
||||||
559 | } |
||||||
560 | } elseif (($this->parent->parent) && strtolower($this->parent->parent->tag) === $tag_lower) { |
||||||
561 | // Grandparent exists and current tag closes it |
||||||
562 | $this->parent->_[HtmlNode::HDOM_INFO_END] = 0; |
||||||
563 | $this->parent = $this->parent->parent; |
||||||
564 | } else { // Random tag, add as text node |
||||||
565 | return $this->as_text_node($tag); |
||||||
566 | } |
||||||
567 | } |
||||||
568 | |||||||
569 | // Link with start tag |
||||||
570 | $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor; |
||||||
571 | |||||||
572 | if ($this->parent->parent) { |
||||||
573 | $this->parent = $this->parent->parent; |
||||||
574 | } |
||||||
575 | |||||||
576 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||||||
577 | return true; |
||||||
578 | } |
||||||
579 | |||||||
580 | // Start tag: https://dev.w3.org/html5/pf-summary/syntax.html#start-tags |
||||||
581 | $node = new HtmlNode($this); |
||||||
582 | $node->_[HtmlNode::HDOM_INFO_BEGIN] = $this->cursor++; |
||||||
583 | |||||||
584 | // Tag name |
||||||
585 | $tag = $this->copy_until($this->token_slash); |
||||||
586 | |||||||
587 | if (isset($tag[0]) && '!' === $tag[0]) { // Doctype, CData, Comment |
||||||
588 | if (isset($tag[2]) && '-' === $tag[1] && '-' === $tag[2]) { // Comment ("<!--") |
||||||
589 | // Go back until $tag only contains start of comment "!--". |
||||||
590 | while (strlen($tag) > 3) { |
||||||
591 | $this->char = $this->doc[--$this->pos]; // previous |
||||||
592 | $tag = substr($tag, 0, strlen($tag) - 1); |
||||||
593 | } |
||||||
594 | |||||||
595 | $node->nodetype = HtmlNode::HDOM_TYPE_COMMENT; |
||||||
596 | $node->tag = 'comment'; |
||||||
597 | |||||||
598 | $data = ''; |
||||||
599 | |||||||
600 | // There is a rare chance of empty comment: "<!---->" |
||||||
601 | // In which case the current char is the first "-" of the end tag |
||||||
602 | // But the comment could also just be a dash: "<!----->" |
||||||
603 | while (true) { |
||||||
604 | // Copy until first char of end tag |
||||||
605 | $data .= $this->copy_until_char('-'); |
||||||
606 | |||||||
607 | // Look ahead in the document, maybe we are at the end |
||||||
608 | if (($this->pos + 3) > $this->size) { // End of document |
||||||
609 | Debug::log('Source document ended unexpectedly!'); |
||||||
610 | break; |
||||||
611 | } elseif ('-->' === substr($this->doc, $this->pos, 3)) { // end |
||||||
612 | $data .= $this->copy_until_char('>'); |
||||||
613 | break; |
||||||
614 | } |
||||||
615 | |||||||
616 | $data .= $this->char; |
||||||
617 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||||||
618 | } |
||||||
619 | |||||||
620 | $tag .= $data; |
||||||
621 | $tag = $this->restore_noise($tag); |
||||||
622 | |||||||
623 | // Comment starts after "!--" and ends before "--" (5 chars total) |
||||||
624 | $node->_[HtmlNode::HDOM_INFO_INNER] = substr($tag, 3, strlen($tag) - 5); |
||||||
625 | } elseif ('[CDATA[' === substr($tag, 1, 7)) { |
||||||
626 | // Go back until $tag only contains start of cdata "![CDATA[". |
||||||
627 | while (strlen($tag) > 8) { |
||||||
628 | $this->char = $this->doc[--$this->pos]; // previous |
||||||
629 | $tag = substr($tag, 0, strlen($tag) - 1); |
||||||
630 | } |
||||||
631 | |||||||
632 | // CDATA can contain HTML stuff, need to find closing tags first |
||||||
633 | $node->nodetype = HtmlNode::HDOM_TYPE_CDATA; |
||||||
634 | $node->tag = 'cdata'; |
||||||
635 | |||||||
636 | $data = ''; |
||||||
637 | |||||||
638 | // There is a rare chance of empty CDATA: "<[CDATA[]]>" |
||||||
639 | // In which case the current char is the first "[" of the end tag |
||||||
640 | // But the CDATA could also just be a bracket: "<[CDATA[]]]>" |
||||||
641 | while (true) { |
||||||
642 | // Copy until first char of end tag |
||||||
643 | $data .= $this->copy_until_char(']'); |
||||||
644 | |||||||
645 | // Look ahead in the document, maybe we are at the end |
||||||
646 | if (($this->pos + 3) > $this->size) { // End of document |
||||||
647 | Debug::log('Source document ended unexpectedly!'); |
||||||
648 | break; |
||||||
649 | } elseif (']]>' === substr($this->doc, $this->pos, 3)) { // end |
||||||
650 | $data .= $this->copy_until_char('>'); |
||||||
651 | break; |
||||||
652 | } |
||||||
653 | |||||||
654 | $data .= $this->char; |
||||||
655 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||||||
656 | } |
||||||
657 | |||||||
658 | $tag .= $data; |
||||||
659 | $tag = $this->restore_noise($tag); |
||||||
660 | |||||||
661 | // CDATA starts after "![CDATA[" and ends before "]]" (10 chars total) |
||||||
662 | $node->_[HtmlNode::HDOM_INFO_INNER] = substr($tag, 8, strlen($tag) - 10); |
||||||
663 | } else { // Unknown |
||||||
664 | Debug::log('Source document contains unknown declaration: <' . $tag); |
||||||
665 | $node->nodetype = HtmlNode::HDOM_TYPE_UNKNOWN; |
||||||
666 | $node->tag = 'unknown'; |
||||||
667 | } |
||||||
668 | |||||||
669 | $node->_[HtmlNode::HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>'); |
||||||
670 | |||||||
671 | if ('>' === $this->char) { |
||||||
672 | $node->_[HtmlNode::HDOM_INFO_TEXT] .= '>'; |
||||||
673 | } |
||||||
674 | |||||||
675 | $this->link_nodes($node, true); |
||||||
676 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||||||
677 | return true; |
||||||
678 | } |
||||||
679 | |||||||
680 | if (!preg_match('/^\w[\w:-]*$/', $tag)) { // Invalid tag name |
||||||
681 | $node->_[HtmlNode::HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>'); |
||||||
682 | |||||||
683 | if ('>' === $this->char) { // End tag |
||||||
684 | $node->_[HtmlNode::HDOM_INFO_TEXT] .= '>'; |
||||||
685 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||||||
686 | } |
||||||
687 | |||||||
688 | $this->link_nodes($node, false); |
||||||
689 | Debug::log('Source document contains invalid tag name: ' . $node->_[HtmlNode::HDOM_INFO_TEXT]); |
||||||
690 | |||||||
691 | return true; |
||||||
692 | } |
||||||
693 | |||||||
694 | // Valid tag name |
||||||
695 | $node->nodetype = HtmlNode::HDOM_TYPE_ELEMENT; |
||||||
696 | $tag_lower = strtolower($tag); |
||||||
697 | $node->tag = ($this->lowercase) ? $tag_lower : $tag; |
||||||
698 | |||||||
699 | if (isset($this->optional_closing_tags[$tag_lower])) { // Optional closing tag |
||||||
700 | while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) { |
||||||
701 | // Previous element was the last element of ancestor |
||||||
702 | $this->parent->_[HtmlNode::HDOM_INFO_END] = $node->_[HtmlNode::HDOM_INFO_BEGIN] - 1; |
||||||
703 | $this->parent = $this->parent->parent; |
||||||
704 | } |
||||||
705 | $node->parent = $this->parent; |
||||||
706 | } |
||||||
707 | |||||||
708 | $guard = 0; // prevent infinity loop |
||||||
709 | |||||||
710 | // [0] Space between tag and first attribute |
||||||
711 | $space = [$this->copy_skip($this->token_blank), '', '']; |
||||||
712 | |||||||
713 | do { // Parse attributes |
||||||
714 | $name = $this->copy_until($this->token_equal); |
||||||
715 | |||||||
716 | if ('' === $name && null !== $this->char && '' === $space[0]) { |
||||||
717 | break; |
||||||
718 | } |
||||||
719 | |||||||
720 | if ($guard === $this->pos) { // Escape infinite loop |
||||||
721 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||||||
722 | continue; |
||||||
723 | } |
||||||
724 | |||||||
725 | $guard = $this->pos; |
||||||
726 | |||||||
727 | if ($this->pos >= $this->size - 1 && '>' !== $this->char) { // End Of File |
||||||
728 | Debug::log('Source document ended unexpectedly!'); |
||||||
729 | $node->nodetype = HtmlNode::HDOM_TYPE_TEXT; |
||||||
730 | $node->_[HtmlNode::HDOM_INFO_END] = 0; |
||||||
731 | $node->_[HtmlNode::HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name; |
||||||
732 | $node->tag = 'text'; |
||||||
733 | $this->link_nodes($node, false); |
||||||
734 | |||||||
735 | return true; |
||||||
736 | } |
||||||
737 | |||||||
738 | if ('/' === $name || '' === $name) { // No more attributes |
||||||
739 | break; |
||||||
740 | } |
||||||
741 | |||||||
742 | // [1] Whitespace after attribute name |
||||||
743 | $space[1] = (false === strpos($this->token_blank, $this->char)) ? '' : $this->copy_skip($this->token_blank); |
||||||
744 | |||||||
745 | $name = $this->restore_noise($name); // might be a noisy name |
||||||
746 | |||||||
747 | if ($this->lowercase) { |
||||||
748 | $name = strtolower($name); |
||||||
749 | } |
||||||
750 | |||||||
751 | if ('=' === $this->char) { // Attribute with value |
||||||
752 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||||||
753 | $this->parse_attr($node, $name, $space, $trim); // get attribute value |
||||||
754 | } else { // Attribute without value |
||||||
755 | $node->_[HtmlNode::HDOM_INFO_QUOTE][$name] = HtmlNode::HDOM_QUOTE_NO; |
||||||
756 | $node->attr[$name] = true; |
||||||
757 | if ('>' !== $this->char) { |
||||||
758 | $this->char = $this->doc[--$this->pos]; |
||||||
759 | } // prev |
||||||
760 | } |
||||||
761 | |||||||
762 | // Space before attribute and around equal sign |
||||||
763 | if (!$trim && $space !== [' ', '', '']) { |
||||||
764 | // phpcs:ignore Generic.Files.LineLength |
||||||
765 | Debug::log_once('Source document contains superfluous whitespace in attributes (<e attribute = "value">). Enable trimming or fix attribute spacing for best performance.'); |
||||||
766 | $node->_[HtmlNode::HDOM_INFO_SPACE][$name] = $space; |
||||||
767 | } |
||||||
768 | |||||||
769 | // prepare for next attribute |
||||||
770 | $space = [ |
||||||
771 | ((false === strpos($this->token_blank, $this->char)) ? '' : $this->copy_skip($this->token_blank)), |
||||||
772 | '', |
||||||
773 | '', |
||||||
774 | ]; |
||||||
775 | } while ('>' !== $this->char && '/' !== $this->char); |
||||||
776 | |||||||
777 | $this->link_nodes($node, true); |
||||||
778 | |||||||
779 | // Space after last attribute before closing the tag |
||||||
780 | if (!$trim && '' !== $space[0]) { |
||||||
781 | // phpcs:ignore Generic.Files.LineLength |
||||||
782 | Debug::log_once('Source document contains superfluous whitespace before the closing braket (<e attribute="value" >). Enable trimming or remove spaces before closing brackets for best performance.'); |
||||||
783 | $node->_[HtmlNode::HDOM_INFO_ENDSPACE] = $space[0]; |
||||||
784 | } |
||||||
785 | |||||||
786 | $rest = ('>' === $this->char) ? '' : $this->copy_until_char('>'); |
||||||
787 | $rest = ($trim) ? trim($rest) : $rest; // <html / > |
||||||
788 | |||||||
789 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||||||
790 | |||||||
791 | if ('/' === trim($rest)) { // Void element |
||||||
792 | if ('' !== $rest) { |
||||||
793 | if (isset($node->_[HtmlNode::HDOM_INFO_ENDSPACE])) { |
||||||
794 | $node->_[HtmlNode::HDOM_INFO_ENDSPACE] .= $rest; |
||||||
795 | } else { |
||||||
796 | $node->_[HtmlNode::HDOM_INFO_ENDSPACE] = $rest; |
||||||
797 | } |
||||||
798 | } |
||||||
799 | $node->_[HtmlNode::HDOM_INFO_END] = 0; |
||||||
800 | } elseif (!isset($this->self_closing_tags[strtolower($node->tag)])) { |
||||||
801 | $innertext = $this->copy_until_char('<'); |
||||||
802 | if ('' !== $innertext) { |
||||||
803 | $node->_[HtmlNode::HDOM_INFO_INNER] = $innertext; |
||||||
804 | } |
||||||
805 | $this->parent = $node; |
||||||
806 | } |
||||||
807 | |||||||
808 | if ('br' === $node->tag) { |
||||||
809 | $node->_[HtmlNode::HDOM_INFO_INNER] = $this->default_br_text; |
||||||
810 | } elseif ('script' === $node->tag) { |
||||||
811 | $data = ''; |
||||||
812 | |||||||
813 | // There is a rare chance of empty script: "<script></script>" |
||||||
814 | // In which case the current char is the start of the end tag |
||||||
815 | // But the script could also just contain tags: "<script><div></script>" |
||||||
816 | while (true) { |
||||||
817 | // Copy until first char of end tag |
||||||
818 | $data .= $this->copy_until_char('<'); |
||||||
819 | |||||||
820 | // Look ahead in the document, maybe we are at the end |
||||||
821 | if (($this->pos + 9) > $this->size) { // End of document |
||||||
822 | Debug::log('Source document ended unexpectedly!'); |
||||||
823 | break; |
||||||
824 | } elseif ('</script' === substr($this->doc, $this->pos, 8)) { // end |
||||||
825 | $this->skip('>'); // don't include the end tag |
||||||
826 | break; |
||||||
827 | } |
||||||
828 | |||||||
829 | // Note: A script tag may contain any other tag except </script> |
||||||
830 | // which needs to be escaped as <\/script> |
||||||
831 | |||||||
832 | $data .= $this->char; |
||||||
833 | $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next |
||||||
834 | } |
||||||
835 | |||||||
836 | $node = new HtmlNode($this); |
||||||
837 | ++$this->cursor; |
||||||
838 | $node->_[HtmlNode::HDOM_INFO_TEXT] = $data; |
||||||
839 | $this->link_nodes($node, false); |
||||||
840 | } |
||||||
841 | |||||||
842 | return true; |
||||||
843 | } |
||||||
844 | |||||||
/**
 * Parse the value of an attribute and store it on the given node.
 *
 * Called with the cursor on the first character after the "=" sign. Reads
 * a double-quoted, single-quoted or unquoted value, restores noise inside
 * it and, unless the attribute already exists on the node, stores the
 * value together with its quote type.
 *
 * @param HtmlNode $node  Node that receives the attribute.
 * @param string   $name  Attribute name.
 * @param array    $space Whitespace triple of the attribute (by reference);
 *                        index 2 receives the whitespace between "=" and
 *                        the value.
 * @param bool     $trim  Collapse whitespace runs inside the value to one
 *                        space and trim the value.
 *
 * @return void
 */
protected function parse_attr($node, $name, &$space, $trim)
{
    // Only the first occurrence of an attribute is stored on the node
    $is_duplicate = isset($node->attr[$name]);

    if (!$is_duplicate) { // Copy whitespace between "=" and value
        // $this->char is null once the end of the document is reached;
        // strpos() must not be called with a null needle.
        $space[2] = (null === $this->char || false === strpos($this->token_blank, $this->char))
            ? ''
            : $this->copy_skip($this->token_blank);
    }

    switch ($this->char) {
        case '"':
            $quote_type = HtmlNode::HDOM_QUOTE_DOUBLE;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // skip opening quote
            $value = $this->copy_until_char('"');
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // skip closing quote
            break;
        case '\'':
            // phpcs:ignore Generic.Files.LineLength
            Debug::log_once('Source document contains attribute values with single quotes (<e attribute=\'value\'>). Use double quotes for best performance.');
            $quote_type = HtmlNode::HDOM_QUOTE_SINGLE;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // skip opening quote
            $value = $this->copy_until_char('\'');
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // skip closing quote
            break;
        default:
            // phpcs:ignore Generic.Files.LineLength
            Debug::log_once('Source document contains attribute values without quotes (<e attribute=value>). Use double quotes for best performance');
            $quote_type = HtmlNode::HDOM_QUOTE_NO;
            $value = $this->copy_until($this->token_attr);
    }

    $value = $this->restore_noise($value);

    if ($trim) {
        // Attribute values must not contain control characters other than space
        // https://www.w3.org/TR/html/dom.html#text-content
        // https://www.w3.org/TR/html/syntax.html#attribute-values
        // https://www.w3.org/TR/xml/#AVNormalize
        $normalized = preg_replace("/[\r\n\t\s]+/u", ' ', $value);

        if (null === $normalized) {
            // With the "u" modifier preg_replace() returns null when the
            // subject is not valid UTF-8. Retry byte-wise so the attribute
            // value is not silently replaced by an empty string.
            $normalized = preg_replace("/[\r\n\t\s]+/", ' ', $value);
        }

        if (null !== $normalized) {
            $value = $normalized;
        }

        $value = trim($value);
    }

    if (!$is_duplicate) {
        // Double quotes are the default and therefore not recorded
        if (HtmlNode::HDOM_QUOTE_DOUBLE !== $quote_type) {
            $node->_[HtmlNode::HDOM_INFO_QUOTE][$name] = $quote_type;
        }
        $node->attr[$name] = $value;
    }
}
||||||
893 | |||||||
/**
 * Attach a node to the current parent node.
 *
 * The node is always appended to the parent's list of nodes; it is also
 * appended to the parent's list of children when $is_child is set.
 *
 * @param HtmlNode $node     Node to attach (its parent property is set).
 * @param bool     $is_child Whether to register the node as a child, too.
 *
 * @return void
 */
protected function link_nodes(&$node, $is_child)
{
    $node->parent = $this->parent;
    $this->parent->nodes[] = $node;

    if (!$is_child) {
        return;
    }

    $this->parent->children[] = $node;
}
||||||
902 | |||||||
/**
 * Store an end tag as literal text.
 *
 * Creates a new node whose text is "</$tag>", links it to the current
 * parent (not as a child) and moves the cursor to the next character.
 *
 * @param string $tag Tag name to render inside the "</...>" text.
 *
 * @return bool Always true.
 */
protected function as_text_node($tag)
{
    $text_node = new HtmlNode($this);
    ++$this->cursor;

    $text_node->_[HtmlNode::HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($text_node, false);

    // Advance to the next character; null marks the end of the document
    ++$this->pos;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return true;
}
||||||
912 | |||||||
/**
 * Move the cursor past a run of the given characters.
 *
 * @param string $chars Characters to skip at the current position.
 *
 * @return void
 */
protected function skip($chars)
{
    $run_length = strspn($this->doc, $chars, $this->pos);
    $this->pos += $run_length;

    // null marks the end of the document
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
}
||||||
918 | |||||||
/**
 * Move the cursor past a run of the given characters and return that run.
 *
 * When the current position does not start with any of the characters the
 * cursor is left untouched.
 *
 * @param string $chars Characters to skip at the current position.
 *
 * @return string The skipped characters, or an empty string.
 */
protected function copy_skip($chars)
{
    $start = $this->pos;
    $length = strspn($this->doc, $chars, $start);

    if (0 !== $length) {
        $this->pos += $length;

        // null marks the end of the document
        if ($this->pos < $this->size) {
            $this->char = $this->doc[$this->pos];
        } else {
            $this->char = null;
        }

        return substr($this->doc, $start, $length);
    }

    return '';
}
||||||
930 | |||||||
/**
 * Move the cursor to the first occurrence of any of the given characters
 * and return everything that was passed on the way.
 *
 * @param string $chars Characters that stop the copy.
 *
 * @return string Text between the old and the new cursor position.
 */
protected function copy_until($chars)
{
    $start = $this->pos;
    $length = strcspn($this->doc, $chars, $start);

    $this->pos += $length;

    // null marks the end of the document
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return substr($this->doc, $start, $length);
}
||||||
939 | |||||||
/**
 * Move the cursor to the next occurrence of a string and return
 * everything that was passed on the way.
 *
 * If the string does not occur any more, the remainder of the document is
 * returned and the cursor is placed at the end of the document.
 *
 * @param string $char String to search for, starting at the cursor.
 *
 * @return string Text between the old and the new cursor position; empty
 *                when the cursor is at the end of the document or already
 *                on an occurrence.
 */
protected function copy_until_char($char)
{
    // Already at the end of the document
    if (null === $this->char) {
        return '';
    }

    $start = $this->pos;
    $found = strpos($this->doc, $char, $start);

    if (false === $found) {
        // No further occurrence: consume the rest of the document
        $remainder = substr($this->doc, $start, $this->size - $start);
        $this->char = null;
        $this->pos = $this->size;

        return $remainder;
    }

    // The cursor is already on an occurrence, nothing to copy
    if ($found === $start) {
        return '';
    }

    $this->char = $this->doc[$found];
    $this->pos = $found;

    return substr($this->doc, $start, $found - $start);
}
||||||
964 | |||||||
/**
 * Replace every match of a pattern in the document with a placeholder.
 *
 * The replaced text is kept in $this->noise under its placeholder, which
 * has the form "___noise___" followed by a number padded to five
 * characters. Afterwards the document size and the current character are
 * refreshed.
 *
 * @param string $pattern    Regular expression to match in the document.
 * @param bool   $remove_tag True to replace the entire match, false to
 *                           replace only the first sub-match.
 *
 * @return void
 */
protected function remove_noise($pattern, $remove_tag = false)
{
    $total = preg_match_all(
        $pattern,
        $this->doc,
        $found,
        PREG_SET_ORDER | PREG_OFFSET_CAPTURE
    );

    // Later matches are replaced first, so the recorded offsets of earlier
    // matches are still correct when they are reached.
    $i = $total - 1;
    while ($i > -1) {
        $placeholder = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

        // 0 = entire match, 1 = submatch
        $group = $remove_tag ? 0 : 1;

        $this->noise[$placeholder] = $found[$i][$group][0];
        $this->doc = substr_replace(
            $this->doc,
            $placeholder,
            $found[$i][$group][1],
            strlen($found[$i][$group][0])
        );

        --$i;
    }

    // The replacements changed the length of the document
    $this->size = strlen($this->doc);

    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
||||||
989 | |||||||
/**
 * Replace noise placeholders in a text with the original content.
 *
 * A placeholder is the marker "___noise___" followed by five characters.
 * Each restored entry is removed from $this->noise. Restored content is
 * scanned again from the same position, so placeholders inside restored
 * content are replaced as well.
 *
 * Placeholders that cannot be resolved are replaced by a diagnostic text:
 * - "UNDEFINED NOISE FOR KEY: <key>" when the key is not known
 * - "NO NUMERIC NOISE KEY" when fewer than five characters follow the marker
 *
 * @param string $text Text that may contain noise placeholders.
 *
 * @return string Text with placeholders replaced.
 */
public function restore_noise($text)
{
    if (empty($this->noise)) {
        return $text; // nothing to restore
    }

    $marker = '___noise___';
    $marker_length = strlen($marker); // 11
    $key_length = $marker_length + 5; // marker plus five characters

    $pos = 0;
    while (false !== ($pos = strpos($text, $marker, $pos))) {
        if (strlen($text) > $pos + $key_length - 1) {
            $key = substr($text, $pos, $key_length);

            if (isset($this->noise[$key])) {
                // $pos is not advanced on purpose: the restored content is
                // scanned again for placeholders.
                $text = substr($text, 0, $pos)
                    . $this->noise[$key]
                    . substr($text, $pos + $key_length);

                unset($this->noise[$key]);
            } else {
                Debug::log_once('Noise restoration failed. DOM has been corrupted!');

                $prefix = 'UNDEFINED NOISE FOR KEY: ';
                $text = substr($text, 0, $pos)
                    . $prefix
                    . $key
                    . substr($text, $pos + $key_length);

                // The diagnostic text contains the marker again (as part
                // of the key). Continue searching behind that marker,
                // otherwise the same unknown key is found over and over
                // again and the loop never terminates.
                $pos += strlen($prefix) + $marker_length;
            }
        } else {
            // There is no complete key behind the marker. Remove the
            // marker so it is not found again.
            Debug::log_once('Noise restoration failed. The provided key is incomplete: ' . $text);
            $text = substr($text, 0, $pos)
                . 'NO NUMERIC NOISE KEY'
                . substr($text, $pos + $marker_length);
        }
    }

    return $text;
}
||||||
1038 | |||||||
/**
 * Find the first stored noise element that contains the given text.
 *
 * @param string $text Text to look for inside the stored noise elements.
 *
 * @return string|null The first noise element containing $text, or null
 *                     if there is none.
 */
public function search_noise($text)
{
    foreach ($this->noise as $candidate) {
        if (false === strpos($candidate, $text)) {
            continue;
        }

        return $candidate;
    }

    return null;
}
||||||
1047 | |||||||
/**
 * Convert the document to a string.
 *
 * @return string Result of innertext() on the root node.
 */
public function __toString()
{
    $root = $this->root;

    return $root->innertext();
}
||||||
1052 | |||||||
/**
 * Provide read access to virtual properties of the document.
 *
 * Supported names:
 * - outertext, innertext: result of innertext() on the root node
 * - plaintext: result of text() on the root node
 * - charset: value of $this->_charset
 * - target_charset: value of $this->_target_charset
 *
 * @param string $name Name of the requested property.
 *
 * @return mixed Value of the property, or null for any other name.
 */
public function __get($name)
{
    // For the document both names resolve to the inner text of the root
    if ('outertext' === $name || 'innertext' === $name) {
        return $this->root->innertext();
    }

    if ('plaintext' === $name) {
        return $this->root->text();
    }

    if ('charset' === $name) {
        return $this->_charset;
    }

    if ('target_charset' === $name) {
        return $this->_target_charset;
    }

    return null;
}
||||||
1068 | |||||||
/**
 * Forward to childNodes() of the root node.
 *
 * @param int $idx Index passed through to the root node (default -1).
 *
 * @return mixed Whatever childNodes() of the root node returns.
 */
public function childNodes($idx = -1)
{
    $root = $this->root;

    return $root->childNodes($idx);
}
||||||
1073 | |||||||
/**
 * Forward to firstChild() of the root node.
 *
 * @return mixed Whatever firstChild() of the root node returns.
 */
public function firstChild()
{
    $root = $this->root;

    return $root->firstChild();
}
||||||
1078 | |||||||
/**
 * Forward to lastChild() of the root node.
 *
 * @return mixed Whatever lastChild() of the root node returns.
 */
public function lastChild()
{
    $root = $this->root;

    return $root->lastChild();
}
||||||
1083 | |||||||
/**
 * Create a new element node.
 *
 * The node is constructed without a document (null is passed to the
 * HtmlNode constructor).
 *
 * @param string      $name  Tag name of the element.
 * @param string|null $value Optional inner text; not set when null.
 *
 * @return HtmlNode The new element node.
 */
public function createElement($name, $value = null)
{
    $element = new HtmlNode(null);

    $element->nodetype = HtmlNode::HDOM_TYPE_ELEMENT;
    $element->tag = $name;
    $element->_[HtmlNode::HDOM_INFO_BEGIN] = 1;
    $element->_[HtmlNode::HDOM_INFO_END] = 1;

    if (null === $value) {
        return $element;
    }

    $element->_[HtmlNode::HDOM_INFO_INNER] = $value;

    return $element;
}
||||||
1099 | |||||||
/**
 * Create a new text node.
 *
 * The node is constructed with this document passed to the HtmlNode
 * constructor.
 *
 * @param string|null $value Text of the node; not set when null.
 *
 * @return HtmlNode The new text node.
 */
public function createTextNode($value)
{
    $text_node = new HtmlNode($this);
    $text_node->nodetype = HtmlNode::HDOM_TYPE_TEXT;

    if (null === $value) {
        return $text_node;
    }

    $text_node->_[HtmlNode::HDOM_INFO_TEXT] = $value;

    return $text_node;
}
||||||
1111 | |||||||
/**
 * Call find() with the selector "#<id>" and index 0.
 *
 * @param string $id Id to build the selector from.
 *
 * @return mixed Whatever find() returns for index 0.
 */
public function getElementById($id)
{
    $selector = '#' . $id;

    return $this->find($selector, 0);
}
||||||
1116 | |||||||
/**
 * Call find() with the selector "#<id>" and the given index.
 *
 * @param string   $id  Id to build the selector from.
 * @param int|null $idx Index passed through to find() (default null).
 *
 * @return mixed Whatever find() returns.
 */
public function getElementsById($id, $idx = null)
{
    $selector = '#' . $id;

    return $this->find($selector, $idx);
}
||||||
1121 | |||||||
/**
 * Call find() with the tag name as selector and index 0.
 *
 * @param string $name Tag name used as selector.
 *
 * @return mixed Whatever find() returns for index 0.
 */
public function getElementByTagName($name)
{
    $selector = $name;

    return $this->find($selector, 0);
}
||||||
1126 | |||||||
/**
 * Call find() with the tag name as selector and the given index.
 *
 * @param string   $name Tag name used as selector.
 * @param int|null $idx  Index passed through to find() (default null).
 *
 * @return mixed Whatever find() returns.
 */
public function getElementsByTagName($name, $idx = null)
{
    $selector = $name;

    return $this->find($selector, $idx);
}
||||||
1131 | |||||||
/**
 * Read a file with file_get_contents() and load its contents.
 *
 * All arguments given to this method are passed on to
 * file_get_contents() unchanged, so its optional parameters can be used
 * as well.
 *
 * @param string $file File name (first argument of file_get_contents()).
 *
 * @return false|null False if the file could not be read, nothing (null)
 *                    after the contents were passed to load().
 */
public function loadFile($file)
{
    $contents = call_user_func_array('file_get_contents', func_get_args());

    if (false === $contents) {
        return false;
    }

    $this->load($contents, true);
}
||||||
1142 | } |
||||||
1143 |