| Total Complexity | 209 |
| Total Lines | 1108 |
| Duplicated Lines | 0 % |
| Changes | 0 | ||
Complex classes like HtmlDocument often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use HtmlDocument and, based on these observations, to apply Extract Interface as well.
| 1 | <?php |
||
| 32 | class HtmlDocument |
||
| 33 | { |
||
| 34 | /** |
||
| 35 | * HtmlNode instance. |
||
| 36 | * |
||
| 37 | * @var HtmlNode |
||
| 38 | */ |
||
| 39 | public $root = null; |
||
| 40 | public $nodes = []; |
||
| 41 | public $callback = null; |
||
| 42 | public $lowercase = false; |
||
| 43 | public $original_size; |
||
| 44 | public $size; |
||
| 45 | |||
| 46 | protected $pos; |
||
| 47 | protected $doc; |
||
| 48 | protected $char; |
||
| 49 | |||
| 50 | protected $cursor; |
||
| 51 | protected $parent; |
||
| 52 | protected $noise = []; |
||
| 53 | protected $token_blank = " \t\r\n"; |
||
| 54 | protected $token_equal = ' =/>'; |
||
| 55 | protected $token_slash = " />\r\n\t"; |
||
| 56 | protected $token_attr = ' >'; |
||
| 57 | |||
| 58 | public $_charset = ''; |
||
| 59 | public $_target_charset = ''; |
||
| 60 | |||
| 61 | public $default_br_text = ''; |
||
| 62 | public $default_span_text = ''; |
||
| 63 | |||
| 64 | protected $self_closing_tags = [ |
||
| 65 | 'area' => 1, |
||
| 66 | 'base' => 1, |
||
| 67 | 'br' => 1, |
||
| 68 | 'col' => 1, |
||
| 69 | 'embed' => 1, |
||
| 70 | 'hr' => 1, |
||
| 71 | 'img' => 1, |
||
| 72 | 'input' => 1, |
||
| 73 | 'link' => 1, |
||
| 74 | 'meta' => 1, |
||
| 75 | 'param' => 1, |
||
| 76 | 'source' => 1, |
||
| 77 | 'track' => 1, |
||
| 78 | 'wbr' => 1, |
||
| 79 | ]; |
||
| 80 | protected $block_tags = [ |
||
| 81 | 'body' => 1, |
||
| 82 | 'div' => 1, |
||
| 83 | 'form' => 1, |
||
| 84 | 'root' => 1, |
||
| 85 | 'span' => 1, |
||
| 86 | 'table' => 1, |
||
| 87 | ]; |
||
| 88 | protected $optional_closing_tags = [ |
||
| 89 | // Not optional, see |
||
| 90 | // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element |
||
| 91 | 'b' => ['b' => 1], |
||
| 92 | 'dd' => ['dd' => 1, 'dt' => 1], |
||
| 93 | // Not optional, see |
||
| 94 | // https://www.w3.org/TR/html/grouping-content.html#the-dl-element |
||
| 95 | 'dl' => ['dd' => 1, 'dt' => 1], |
||
| 96 | 'dt' => ['dd' => 1, 'dt' => 1], |
||
| 97 | 'li' => ['li' => 1], |
||
| 98 | 'optgroup' => ['optgroup' => 1, 'option' => 1], |
||
| 99 | 'option' => ['optgroup' => 1, 'option' => 1], |
||
| 100 | 'p' => ['p' => 1], |
||
| 101 | 'rp' => ['rp' => 1, 'rt' => 1], |
||
| 102 | 'rt' => ['rp' => 1, 'rt' => 1], |
||
| 103 | 'td' => ['td' => 1, 'th' => 1], |
||
| 104 | 'th' => ['td' => 1, 'th' => 1], |
||
| 105 | 'tr' => ['td' => 1, 'th' => 1, 'tr' => 1], |
||
| 106 | ]; |
||
| 107 | |||
/**
 * Forwards calls to deprecated lower_case method names.
 *
 * Recognised names:
 *  - `load_file` is forwarded to `loadFile()` after logging a deprecation
 *    message.
 *  - `clear` is accepted and silently ignored (no-op).
 * Any other name raises an E_USER_ERROR.
 *
 * @param string $func Name of the inaccessible method that was called
 * @param array  $args Arguments that were passed to that method
 *
 * @return mixed Result of the forwarded call; null for `clear` and for
 *               unknown method names
 */
public function __call($func, $args)
{
    // Allow users to call methods with lower_case syntax
    switch ($func) {
        case 'load_file':
            $actual_function = 'loadFile';
            break;
        case 'clear':
            return; /* no-op */
        default:
            trigger_error(
                'Call to undefined method ' . __CLASS__ . '::' . $func . '()',
                E_USER_ERROR
            );

            // A user-defined error handler may resume execution after the
            // error. Without this return the code below would run with an
            // undefined $actual_function.
            return;
    }

    // phpcs:ignore Generic.Files.LineLength
    Debug::log(__CLASS__ . '->' . $func . '() has been deprecated and will be removed in the next major version of simplehtmldom. Use ' . __CLASS__ . '->' . $actual_function . '() instead.');

    return call_user_func_array([$this, $actual_function], $args);
}
||
| 129 | |||
/**
 * Creates a document and, when $str is non-empty, loads it immediately.
 *
 * A $str that starts with "http://" or names an existing file is handed to
 * load_file(); any other non-empty value is parsed as markup via load().
 * An empty $str only initialises the parser state via prepare().
 *
 * @param string|null $str             Markup, file path or http URL
 * @param bool        $lowercase       Passed to load()/prepare()
 * @param bool        $forceTagsClosed When false, the table of optional
 *                                     closing tags is emptied
 * @param string      $target_charset  Stored in $_target_charset, which
 *                                     decode() passes to html_entity_decode()
 * @param bool        $stripRN         Passed to load()
 * @param string      $defaultBRText   Passed to load()/prepare()
 * @param string      $defaultSpanText Passed to load()/prepare()
 * @param int         $options         Passed to load()
 */
public function __construct(
    $str = null,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0
) {
    // Both settings are read while a document is parsed and decoded, so
    // they have to be in place BEFORE anything is loaded below.

    // Forcing tags to be closed implies that we don't trust the html, but
    // it can lead to parsing errors if we SHOULD trust the html.
    if (!$forceTagsClosed) {
        // The parser consults $optional_closing_tags; the name previously
        // assigned here ("optional_closing_array") does not exist.
        $this->optional_closing_tags = [];
    }

    $this->_target_charset = $target_charset;

    if ($str) {
        if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
            $this->load_file($str);
        } else {
            $this->load(
                $str,
                $lowercase,
                $stripRN,
                $defaultBRText,
                $defaultSpanText,
                $options
            );
        }
    } else {
        $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
    }
}
||
| 164 | |||
/**
 * Supplies the reduced set of properties shown by var_dump().
 *
 * @return array Root node, noise table (or the string 'none' when it is
 *               empty), source charset, target charset and original size
 */
public function __debugInfo()
{
    $noise = $this->noise;

    if (empty($noise)) {
        $noise = 'none';
    }

    $info = [];
    $info['root'] = $this->root;
    $info['noise'] = $noise;
    $info['charset'] = $this->_charset;
    $info['target charset'] = $this->_target_charset;
    $info['original size'] = $this->original_size;

    return $info;
}
||
| 175 | |||
/**
 * Calls clear() on every node registered in $nodes when the document is
 * destroyed.
 */
public function __destruct()
{
    if (!isset($this->nodes)) {
        return;
    }

    foreach ($this->nodes as $node) {
        $node->clear();
    }
}
||
| 184 | |||
/**
 * Parses a markup string into this document.
 *
 * Steps: reset the parser state, optionally collapse line breaks and the
 * whitespace around them, hide server-side (and optionally Smarty) code
 * from the parser, build the node tree, detect the charset and decode
 * entities. The raw source string is discarded afterwards.
 *
 * @param string $str             Markup to parse
 * @param bool   $lowercase       Passed to prepare()
 * @param bool   $stripRN         Collapse line breaks and surrounding
 *                                whitespace; also passed to parse()
 * @param string $defaultBRText   Passed to prepare()
 * @param string $defaultSpanText Passed to prepare()
 * @param int    $options         Bit mask; HDOM_SMARTY_AS_TEXT is checked
 *
 * @return $this So that calls can be chained
 */
public function load(
    $str,
    $lowercase = true,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0
) {
    $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

    if ($stripRN) {
        // Elements whose whitespace must survive are temporarily taken
        // out of the document, in exactly this order.
        $whitespace_sensitive = [
            "'<\s*script[^>]*>(.*?)<\s*/\s*script\s*>'is",
            "'<!\[CDATA\[(.*?)\]\]>'is",
            "'<!--(.*?)-->'is",
            "'<\s*style[^>]*>(.*?)<\s*/\s*style\s*>'is",
            "'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is",
        ];

        foreach ($whitespace_sensitive as $pattern) {
            $this->remove_noise($pattern);
        }

        // First drop whitespace and newlines between tags, then replace
        // the remaining line breaks inside text with a single space.
        $this->doc = preg_replace(
            [
                '/\>([\t\s]*[\r\n]^[\t\s]*)\</m',
                '/([\t\s]*[\r\n]^[\t\s]*)/m',
            ],
            [
                '><',
                ' ',
            ],
            $this->doc
        );

        // Put the protected elements back and measure the result
        $this->doc = $this->restore_noise($this->doc);
        $this->size = strlen($this->doc);
    }

    // Server-side script
    $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);

    if (count($this->noise) > 0) {
        // phpcs:ignore Generic.Files.LineLength
        Debug::log('Support for server-side scripts has been deprecated and will be removed in the next major version of simplehtmldom.');
    }

    // Strip Smarty scripts
    if ($options & HDOM_SMARTY_AS_TEXT) {
        $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
        // phpcs:ignore Generic.Files.LineLength
        Debug::log('Support for Smarty scripts has been deprecated and will be removed in the next major version of simplehtmldom.');
    }

    // Build the node tree and mark where the root ends
    $this->parse($stripRN);
    $this->root->_[HtmlNode::HDOM_INFO_END] = $this->cursor;

    $this->parse_charset();
    $this->decode();

    // The raw source is not needed once the tree exists
    unset($this->doc);

    return $this;
}
||
| 238 | |||
/**
 * Stores a callback in the public $callback property.
 *
 * @param callable $function_name Callback to store (how and when it is
 *                                invoked is not visible in this method)
 */
public function set_callback($function_name)
{
    $callback = $function_name;

    $this->callback = $callback;
}
||
| 243 | |||
/**
 * Resets the $callback property to null.
 */
public function remove_callback()
{
    $none = null;

    $this->callback = $none;
}
||
| 248 | |||
/**
 * Serialises the document and optionally writes it to a file.
 *
 * @param string $filepath Destination file; nothing is written when this
 *                         is the empty string
 *
 * @return string The inner text of the root node
 */
public function save($filepath = '')
{
    $html = $this->root->innertext();

    if ('' === $filepath) {
        return $html;
    }

    // Exclusive lock while writing
    file_put_contents($filepath, $html, LOCK_EX);

    return $html;
}
||
| 263 | |||
/**
 * Runs a CSS selector query against the root node.
 *
 * All arguments are handed to the root node's find() unchanged.
 *
 * @param string      $selector  CSS Selector
 * @param number|null $idx
 * @param bool        $lowercase
 *
 * @return HtmlNode[]|HtmlNode
 */
public function find($selector, $idx = null, $lowercase = false)
{
    $root = $this->root;

    return $root->find($selector, $idx, $lowercase);
}
||
| 277 | |||
/**
 * Returns the inner text of the first <title> element.
 *
 * @return string|null Null when find() yields no usable title element
 */
public function title()
{
    $element = $this->find('title', 0);

    if (!$element) {
        return null;
    }

    return $element->innertext;
}
||
| 283 | |||
/**
 * Delegates to the root node's expect() with the arguments unchanged.
 *
 * @param string      $selector  Selector handed to the root node
 * @param number|null $idx
 * @param bool        $lowercase
 *
 * @return mixed Whatever the root node's expect() returns
 */
public function expect($selector, $idx = null, $lowercase = false)
{
    $root = $this->root;

    return $root->expect($selector, $idx, $lowercase);
}
||
| 288 | |||
/**
 * Delegates to the root node's dump().
 *
 * @param bool $show_attr Handed to the root node unchanged
 *
 * @codeCoverageIgnore
 */
public function dump($show_attr = true)
{
    $root = $this->root;

    $root->dump($show_attr);
}
||
| 294 | |||
/**
 * Resets the parser state for a new source string.
 *
 * Trims the source, records its size, rewinds the position and cursor,
 * empties the noise and node lists and creates a fresh root node that
 * also becomes the current parent.
 *
 * @param string|null $str             Source markup; null is treated as
 *                                     the empty string
 * @param bool        $lowercase       Stored in $lowercase
 * @param string      $defaultBRText   Stored in $default_br_text
 * @param string      $defaultSpanText Stored in $default_span_text
 */
protected function prepare(
    $str,
    $lowercase = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT
) {
    $this->clear();

    // The constructor passes null by default; trim(null) is deprecated as
    // of PHP 8.1, so map null to the empty string explicitly.
    $this->doc = trim(null === $str ? '' : $str);
    $this->size = strlen($this->doc);
    $this->original_size = $this->size; // original size of the html
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = [];
    $this->nodes = [];
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;
    $this->root = new HtmlNode($this);
    $this->root->tag = 'root';
    $this->root->_[HtmlNode::HDOM_INFO_BEGIN] = -1;
    $this->root->nodetype = HtmlNode::HDOM_TYPE_ROOT;
    $this->parent = $this->root;

    // Always reset the current character so that an empty document does
    // not inherit the last character of a previously loaded one.
    $this->char = ($this->size > 0) ? $this->doc[0] : null;
}
||
| 322 | |||
/**
 * Decodes HTML entities in every registered node.
 *
 * For each node the stored text and inner text (when present) first get
 * their noise restored and are then entity-decoded; attribute values are
 * entity-decoded directly. Attributes whose value is boolean true
 * (attributes without a value) are left untouched.
 */
protected function decode()
{
    $flags = ENT_QUOTES | ENT_HTML5;
    $charset = $this->_target_charset;

    // Text is handled before inner text, matching the storage order
    $text_slots = [HtmlNode::HDOM_INFO_TEXT, HtmlNode::HDOM_INFO_INNER];

    foreach ($this->nodes as $node) {
        foreach ($text_slots as $slot) {
            if (!isset($node->_[$slot])) {
                continue;
            }

            $restored = $this->restore_noise($node->_[$slot]);
            $node->_[$slot] = html_entity_decode($restored, $flags, $charset);
        }

        if (!isset($node->attr) || !is_array($node->attr)) {
            continue;
        }

        foreach ($node->attr as $attr_name => $attr_value) {
            if (true !== $attr_value) {
                $node->attr[$attr_name] = html_entity_decode($attr_value, $flags, $charset);
            }
        }
    }
}
||
| 354 | |||
| 355 | protected function parse($trim = false) |
||
| 376 | } |
||
| 377 | } |
||
| 378 | } |
||
| 379 | |||
/**
 * Determines the charset of the source document and stores it in
 * $_charset.
 *
 * Sources are tried in this order until one yields a non-empty value:
 *  1. the Content-Type header reported by the optional global function
 *     get_last_retrieve_url_contents_content_type(),
 *  2. <meta http-equiv="Content-Type" content="...charset=...">,
 *  3. <meta charset="...">,
 *  4. mb_detect_encoding() on the raw document (when mbstring is loaded),
 *  5. the fallback 'UTF-8'.
 * ISO-8859-1 and its "latin" aliases are reported as CP1252.
 *
 * @return string The detected charset
 */
protected function parse_charset()
{
    $charset = null;

    // Characters that may surround a charset value (whitespace and quotes,
    // e.g. charset="utf-8") but are never part of the name itself.
    $padding = " \t\r\n\"'";

    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $contentTypeHeader = call_user_func('get_last_retrieve_url_contents_content_type');

        // Parameter names in a media type are case-insensitive, so match
        // "Charset=" as well; the cast guards against a null header.
        $success = preg_match('/charset=(.+)/i', (string) $contentTypeHeader, $matches);
        if ($success) {
            $charset = trim($matches[1], $padding);
        }

        // phpcs:ignore Generic.Files.LineLength
        Debug::log('Determining charset using get_last_retrieve_url_contents_content_type() ' . ($success ? 'successful' : 'failed'));
    }

    if (empty($charset)) {
        // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
        $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

        if (!empty($el)) {
            $fullvalue = $el->content;

            if (!empty($fullvalue)) {
                $success = preg_match(
                    '/charset=(.+)/i',
                    $fullvalue,
                    $matches
                );

                if ($success) {
                    $charset = trim($matches[1], $padding);
                }
            }
        }
    }

    if (empty($charset)) {
        // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
        if ($meta = $this->root->find('meta[charset]', 0)) {
            $charset = $meta->charset;
        }
    }

    if (empty($charset)) {
        // Try to guess the charset based on the content
        // Requires Multibyte String (mbstring) support (optional)
        if (function_exists('mb_detect_encoding')) {
            /**
             * mb_detect_encoding() is not intended to distinguish between
             * charsets, especially single-byte charsets. Its primary
             * purpose is to detect which multibyte encoding is in use,
             * i.e. UTF-8, UTF-16, shift-JIS, etc.
             *
             * -- https://bugs.php.net/bug.php?id=38138
             *
             * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
             * always result in CP1251/ISO-8859-5 and vice versa.
             *
             * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
             * to stay compatible.
             */
            $encoding = mb_detect_encoding(
                $this->doc,
                ['UTF-8', 'CP1252', 'ISO-8859-1']
            );

            if ('CP1252' === $encoding || 'ISO-8859-1' === $encoding) {
                // Due to a limitation of mb_detect_encoding
                // 'CP1251'/'ISO-8859-5' will be detected as
                // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
                // which case we can simply assume it is the other charset.
                // The warning is suppressed on purpose: failure is an
                // expected outcome of this probe.
                if (!@iconv('CP1252', 'UTF-8', $this->doc)) {
                    $encoding = 'CP1251';
                }
            }

            if (false !== $encoding) {
                $charset = $encoding;
            }
        }
    }

    if (empty($charset)) {
        Debug::log('Unable to determine charset from source document. Assuming UTF-8');
        $charset = 'UTF-8';
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want
    // it instead.
    if (in_array(strtolower($charset), ['iso-8859-1', 'latin1', 'latin-1'], true)) {
        $charset = 'CP1252';
    }

    return $this->_charset = $charset;
}
||
| 478 | |||
/**
 * Reads one tag at the current position and links the resulting node(s)
 * into the tree.
 *
 * Handles, in this order: end of input, end tags, declarations
 * (comments, CDATA, unknown "<!..." constructs), invalid tag names and
 * regular start tags including their attributes. Script elements are
 * read as raw text up to the next "</script" (matched case-insensitively).
 *
 * @param bool $trim Whether superfluous whitespace inside tags is skipped
 *                   instead of being recorded on the node
 *
 * @return bool False when the current character is not "<" (end of
 *              input), true after a tag has been consumed
 */
protected function read_tag($trim)
{
    if ('<' !== $this->char) { // End Of File
        $this->root->_[HtmlNode::HDOM_INFO_END] = $this->cursor;

        // We might be in a nest of unclosed elements for which the end tags
        // can be omitted. Close them for faster seek operations.
        do {
            if (isset($this->optional_closing_tags[strtolower($this->parent->tag)])) {
                $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;
            }
        } while ($this->parent = $this->parent->parent);

        return false;
    }

    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    if ($trim) { // "< /html>"
        $this->skip($this->token_blank);
    }

    // End tag: https://dev.w3.org/html5/pf-summary/syntax.html#end-tags
    if ('/' === $this->char) {
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

        $tag = $this->copy_until_char('>');
        $tag = $trim ? ltrim($tag, $this->token_blank) : $tag;

        // Skip attributes and whitespace in end tags
        if ($trim && false !== ($pos = strpos($tag, ' '))) {
            // phpcs:ignore Generic.Files.LineLength
            Debug::log_once('Source document contains superfluous whitespace in end tags (</html >).');
            $tag = substr($tag, 0, $pos);
        }

        if (strcasecmp($this->parent->tag, $tag)) { // Parent is not start tag
            $parent_lower = strtolower($this->parent->tag);
            $tag_lower = strtolower($tag);
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                $org_parent = $this->parent;

                // Look for the start tag
                while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
                    // Close any unclosed element with optional end tags
                    if (isset($this->optional_closing_tags[strtolower($this->parent->tag)])) {
                        $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;
                    }
                    $this->parent = $this->parent->parent;
                }

                // No start tag, close grandparent
                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent;

                    if ($this->parent->parent) {
                        $this->parent = $this->parent->parent;
                    }

                    $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;

                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
                // grandparent exists + current is block tag
                // Parent has no end tag
                $this->parent->_[HtmlNode::HDOM_INFO_END] = 0;
                $org_parent = $this->parent;

                // Find start tag
                while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $this->parent->parent;
                }

                // No start tag, close parent
                if (strtolower($this->parent->tag) !== $tag_lower) {
                    $this->parent = $org_parent; // restore original parent
                    $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;

                    return $this->as_text_node($tag);
                }
            } elseif (($this->parent->parent) && strtolower($this->parent->parent->tag) === $tag_lower) {
                // Grandparent exists and current tag closes it
                $this->parent->_[HtmlNode::HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            } else { // Random tag, add as text node
                return $this->as_text_node($tag);
            }
        }

        // Link with start tag
        $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;

        if ($this->parent->parent) {
            $this->parent = $this->parent->parent;
        }

        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    // Start tag: https://dev.w3.org/html5/pf-summary/syntax.html#start-tags
    $node = new HtmlNode($this);
    $node->_[HtmlNode::HDOM_INFO_BEGIN] = $this->cursor++;

    // Tag name
    $tag = $this->copy_until($this->token_slash);

    if (isset($tag[0]) && '!' === $tag[0]) { // Doctype, CData, Comment
        if (isset($tag[2]) && '-' === $tag[1] && '-' === $tag[2]) { // Comment ("<!--")
            // Go back until $tag only contains start of comment "!--".
            while (strlen($tag) > 3) {
                $this->char = $this->doc[--$this->pos]; // previous
                $tag = substr($tag, 0, strlen($tag) - 1);
            }

            $node->nodetype = HtmlNode::HDOM_TYPE_COMMENT;
            $node->tag = 'comment';

            $data = '';

            // There is a rare chance of empty comment: "<!---->"
            // In which case the current char is the first "-" of the end tag
            // But the comment could also just be a dash: "<!----->"
            while (true) {
                // Copy until first char of end tag
                $data .= $this->copy_until_char('-');

                // Look ahead in the document, maybe we are at the end
                if (($this->pos + 3) > $this->size) { // End of document
                    Debug::log('Source document ended unexpectedly!');
                    break;
                } elseif ('-->' === substr($this->doc, $this->pos, 3)) { // end
                    $data .= $this->copy_until_char('>');
                    break;
                }

                $data .= $this->char;
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            }

            $tag .= $data;
            $tag = $this->restore_noise($tag);

            // Comment starts after "!--" and ends before "--" (5 chars total)
            $node->_[HtmlNode::HDOM_INFO_INNER] = substr($tag, 3, strlen($tag) - 5);
        } elseif ('[CDATA[' === substr($tag, 1, 7)) {
            // Go back until $tag only contains start of cdata "![CDATA[".
            while (strlen($tag) > 8) {
                $this->char = $this->doc[--$this->pos]; // previous
                $tag = substr($tag, 0, strlen($tag) - 1);
            }

            // CDATA can contain HTML stuff, need to find closing tags first
            $node->nodetype = HtmlNode::HDOM_TYPE_CDATA;
            $node->tag = 'cdata';

            $data = '';

            // There is a rare chance of empty CDATA: "<![CDATA[]]>"
            // In which case the current char is the first "]" of the end tag
            // But the CDATA could also just be a bracket: "<![CDATA[]]]>"
            while (true) {
                // Copy until first char of end tag
                $data .= $this->copy_until_char(']');

                // Look ahead in the document, maybe we are at the end
                if (($this->pos + 3) > $this->size) { // End of document
                    Debug::log('Source document ended unexpectedly!');
                    break;
                } elseif (']]>' === substr($this->doc, $this->pos, 3)) { // end
                    $data .= $this->copy_until_char('>');
                    break;
                }

                $data .= $this->char;
                $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            }

            $tag .= $data;
            $tag = $this->restore_noise($tag);

            // CDATA starts after "![CDATA[" and ends before "]]" (10 chars total)
            $node->_[HtmlNode::HDOM_INFO_INNER] = substr($tag, 8, strlen($tag) - 10);
        } else { // Unknown
            Debug::log('Source document contains unknown declaration: <' . $tag);
            $node->nodetype = HtmlNode::HDOM_TYPE_UNKNOWN;
            $node->tag = 'unknown';
        }

        $node->_[HtmlNode::HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

        if ('>' === $this->char) {
            $node->_[HtmlNode::HDOM_INFO_TEXT] .= '>';
        }

        $this->link_nodes($node, true);
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        return true;
    }

    if (!preg_match('/^\w[\w:-]*$/', $tag)) { // Invalid tag name
        $node->_[HtmlNode::HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');

        if ('>' === $this->char) { // End tag
            $node->_[HtmlNode::HDOM_INFO_TEXT] .= '>';
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        }

        $this->link_nodes($node, false);
        Debug::log('Source document contains invalid tag name: ' . $node->_[HtmlNode::HDOM_INFO_TEXT]);

        return true;
    }

    // Valid tag name
    $node->nodetype = HtmlNode::HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    if (isset($this->optional_closing_tags[$tag_lower])) { // Optional closing tag
        while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
            // Previous element was the last element of ancestor
            $this->parent->_[HtmlNode::HDOM_INFO_END] = $node->_[HtmlNode::HDOM_INFO_BEGIN] - 1;
            $this->parent = $this->parent->parent;
        }
        $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop

    // [0] Space between tag and first attribute
    $space = [$this->copy_skip($this->token_blank), '', ''];

    do { // Parse attributes
        $name = $this->copy_until($this->token_equal);

        if ('' === $name && null !== $this->char && '' === $space[0]) {
            break;
        }

        if ($guard === $this->pos) { // Escape infinite loop
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            continue;
        }

        $guard = $this->pos;

        if ($this->pos >= $this->size - 1 && '>' !== $this->char) { // End Of File
            Debug::log('Source document ended unexpectedly!');
            $node->nodetype = HtmlNode::HDOM_TYPE_TEXT;
            $node->_[HtmlNode::HDOM_INFO_END] = 0;
            $node->_[HtmlNode::HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
            $node->tag = 'text';
            $this->link_nodes($node, false);

            return true;
        }

        if ('/' === $name || '' === $name) { // No more attributes
            break;
        }

        // [1] Whitespace after attribute name
        $space[1] = (false === strpos($this->token_blank, $this->char)) ? '' : $this->copy_skip($this->token_blank);

        $name = $this->restore_noise($name); // might be a noisy name

        if ($this->lowercase) {
            $name = strtolower($name);
        }

        if ('=' === $this->char) { // Attribute with value
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $this->parse_attr($node, $name, $space, $trim); // get attribute value
        } else { // Attribute without value
            $node->_[HtmlNode::HDOM_INFO_QUOTE][$name] = HtmlNode::HDOM_QUOTE_NO;
            $node->attr[$name] = true;
            if ('>' !== $this->char) {
                $this->char = $this->doc[--$this->pos];
            } // prev
        }

        // Space before attribute and around equal sign
        if (!$trim && $space !== [' ', '', '']) {
            // phpcs:ignore Generic.Files.LineLength
            Debug::log_once('Source document contains superfluous whitespace in attributes (<e attribute = "value">). Enable trimming or fix attribute spacing for best performance.');
            $node->_[HtmlNode::HDOM_INFO_SPACE][$name] = $space;
        }

        // prepare for next attribute
        $space = [
            ((false === strpos($this->token_blank, $this->char)) ? '' : $this->copy_skip($this->token_blank)),
            '',
            '',
        ];
    } while ('>' !== $this->char && '/' !== $this->char);

    $this->link_nodes($node, true);

    // Space after last attribute before closing the tag
    if (!$trim && '' !== $space[0]) {
        // phpcs:ignore Generic.Files.LineLength
        Debug::log_once('Source document contains superfluous whitespace before the closing braket (<e attribute="value" >). Enable trimming or remove spaces before closing brackets for best performance.');
        $node->_[HtmlNode::HDOM_INFO_ENDSPACE] = $space[0];
    }

    $rest = ('>' === $this->char) ? '' : $this->copy_until_char('>');
    $rest = ($trim) ? trim($rest) : $rest; // <html / >

    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    if ('/' === trim($rest)) { // Void element
        if ('' !== $rest) {
            if (isset($node->_[HtmlNode::HDOM_INFO_ENDSPACE])) {
                $node->_[HtmlNode::HDOM_INFO_ENDSPACE] .= $rest;
            } else {
                $node->_[HtmlNode::HDOM_INFO_ENDSPACE] = $rest;
            }
        }
        $node->_[HtmlNode::HDOM_INFO_END] = 0;
    } elseif (!isset($this->self_closing_tags[strtolower($node->tag)])) {
        $innertext = $this->copy_until_char('<');
        if ('' !== $innertext) {
            $node->_[HtmlNode::HDOM_INFO_INNER] = $innertext;
        }
        $this->parent = $node;
    }

    // HTML tag names are case-insensitive, so compare the lowercased name:
    // $node->tag keeps the source casing when $this->lowercase is false.
    if ('br' === $tag_lower) {
        $node->_[HtmlNode::HDOM_INFO_INNER] = $this->default_br_text;
    } elseif ('script' === $tag_lower) {
        $data = '';

        // There is a rare chance of empty script: "<script></script>"
        // In which case the current char is the start of the end tag
        // But the script could also just contain tags: "<script><div></script>"
        while (true) {
            // Copy until first char of end tag
            $data .= $this->copy_until_char('<');

            // Look ahead in the document, maybe we are at the end
            if (($this->pos + 9) > $this->size) { // End of document
                Debug::log('Source document ended unexpectedly!');
                break;
            } elseif (0 === strcasecmp('</script', substr($this->doc, $this->pos, 8))) { // end
                // Case-insensitive so that "</SCRIPT>" also terminates the
                // script instead of swallowing the rest of the document.
                $this->skip('>'); // don't include the end tag
                break;
            }

            // Note: A script tag may contain any other tag except </script>
            // which needs to be escaped as <\/script>

            $data .= $this->char;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        }

        // The raw script source becomes a separate node linked after the
        // script element itself.
        $node = new HtmlNode($this);
        ++$this->cursor;
        $node->_[HtmlNode::HDOM_INFO_TEXT] = $data;
        $this->link_nodes($node, false);
    }

    return true;
}
||
| 844 | |||
/**
 * Read one attribute value at the current parser position and store it on
 * the node.
 *
 * The value may be double-quoted, single-quoted or unquoted. Noise
 * placeholders inside the value are expanded through restore_noise().
 * When the attribute is already set on the node (duplicate attribute), the
 * value is still consumed from the document but the first one is kept.
 *
 * @param HtmlNode $node  Node that receives the attribute.
 * @param string   $name  Attribute name, used as key in $node->attr.
 * @param array    $space Whitespace record (by reference); index 2 receives
 *                        the whitespace found between "=" and the value.
 * @param bool     $trim  When truthy, whitespace runs inside the value are
 *                        collapsed to a single space and the value is trimmed.
 *
 * @return void
 */
protected function parse_attr($node, $name, &$space, $trim)
{
    $is_duplicate = isset($node->attr[$name]);

    if (!$is_duplicate) { // Copy whitespace between "=" and value
        // A null char means the document ended; there is no whitespace to
        // copy and strpos() must not be called with a null needle.
        $at_blank = (null !== $this->char)
            && (false !== strpos($this->token_blank, $this->char));

        $space[2] = $at_blank ? $this->copy_skip($this->token_blank) : '';
    }

    switch ($this->char) {
        case '"':
            $quote_type = HtmlNode::HDOM_QUOTE_DOUBLE;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $value = $this->copy_until_char('"');
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            break;
        case '\'':
            // phpcs:ignore Generic.Files.LineLength
            Debug::log_once('Source document contains attribute values with single quotes (<e attribute=\'value\'>). Use double quotes for best performance.');
            $quote_type = HtmlNode::HDOM_QUOTE_SINGLE;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            $value = $this->copy_until_char('\'');
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
            break;
        default:
            // phpcs:ignore Generic.Files.LineLength
            Debug::log_once('Source document contains attribute values without quotes (<e attribute=value>). Use double quotes for best performance');
            $quote_type = HtmlNode::HDOM_QUOTE_NO;
            $value = $this->copy_until($this->token_attr);
    }

    $value = $this->restore_noise($value);

    if ($trim) {
        // Attribute values must not contain control characters other than space
        // https://www.w3.org/TR/html/dom.html#text-content
        // https://www.w3.org/TR/html/syntax.html#attribute-values
        // https://www.w3.org/TR/xml/#AVNormalize
        $normalized = preg_replace("/[\r\n\t\s]+/u", ' ', $value);

        if (null === $normalized) {
            // With the "u" modifier preg_replace() returns null when the
            // subject is not valid UTF-8. Retry byte-wise instead of
            // silently replacing the attribute value with an empty string.
            $normalized = preg_replace("/[\r\n\t\s]+/", ' ', $value);
        }

        if (null !== $normalized) {
            $value = $normalized;
        }

        $value = trim($value);
    }

    if (!$is_duplicate) {
        // Only non-default quoting is recorded; double quotes are implied.
        if (HtmlNode::HDOM_QUOTE_DOUBLE !== $quote_type) {
            $node->_[HtmlNode::HDOM_INFO_QUOTE][$name] = $quote_type;
        }
        $node->attr[$name] = $value;
    }
}
||
| 893 | |||
/**
 * Attach a node to the current parent node.
 *
 * The node is always appended to the parent's "nodes" list; it is appended
 * to the parent's "children" list only when $is_child is truthy.
 *
 * @param HtmlNode $node     Node to attach (by reference).
 * @param bool     $is_child Whether the node also counts as a child element.
 *
 * @return void
 */
protected function link_nodes(&$node, $is_child)
{
    $node->parent = $this->parent;
    $this->parent->nodes[] = $node;

    if (!$is_child) {
        return;
    }

    $this->parent->children[] = $node;
}
||
| 902 | |||
/**
 * Record a closing tag as plain text.
 *
 * Creates a new node whose text is "</$tag>", links it to the current parent
 * as a non-child node, then advances the parser by one character.
 *
 * @param string $tag Tag name to wrap in "</" and ">".
 *
 * @return bool Always true.
 */
protected function as_text_node($tag)
{
    $text_node = new HtmlNode($this);
    ++$this->cursor;

    $text_node->_[HtmlNode::HDOM_INFO_TEXT] = '</' . $tag . '>';
    $this->link_nodes($text_node, false);

    // Step to the next character; null marks the end of the document.
    ++$this->pos;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return true;
}
||
| 912 | |||
/**
 * Advance the parser past every leading character contained in $chars.
 *
 * @param string $chars Set of characters to skip over.
 *
 * @return void
 */
protected function skip($chars)
{
    $skipped = strspn($this->doc, $chars, $this->pos);
    $this->pos += $skipped;

    // Refresh the current character; null marks the end of the document.
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
}
||
| 918 | |||
/**
 * Consume the run of characters from $chars at the current position and
 * return it.
 *
 * @param string $chars Set of characters to consume.
 *
 * @return string The consumed characters, or '' when the current position
 *                does not start with any of them (parser state unchanged).
 */
protected function copy_skip($chars)
{
    $start = $this->pos;
    $length = strspn($this->doc, $chars, $start);

    if (0 === $length) {
        return '';
    }

    $this->pos += $length;

    // Refresh the current character; null marks the end of the document.
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return substr($this->doc, $start, $length);
}
||
| 930 | |||
/**
 * Consume characters up to (not including) the first character contained in
 * $chars and return them.
 *
 * @param string $chars Set of characters that stop the copy.
 *
 * @return string The consumed text (possibly empty).
 */
protected function copy_until($chars)
{
    $start = $this->pos;
    $length = strcspn($this->doc, $chars, $start);

    $this->pos += $length;

    // Refresh the current character; null marks the end of the document.
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return substr($this->doc, $start, $length);
}
||
| 939 | |||
/**
 * Consume characters up to (not including) the next occurrence of $char.
 *
 * When $char does not occur in the rest of the document, everything up to
 * the end is consumed and the current character becomes null.
 *
 * @param string $char Delimiter to stop at.
 *
 * @return string The consumed text; '' when the parser is already at the end
 *                of the document or already positioned on $char.
 */
protected function copy_until_char($char)
{
    if (null === $this->char) {
        return '';
    }

    $start = $this->pos;
    $found = strpos($this->doc, $char, $start);

    if (false === $found) {
        // Delimiter is absent: hand back the remainder and mark end of input.
        $remainder = substr($this->doc, $start, $this->size - $start);
        $this->char = null;
        $this->pos = $this->size;

        return $remainder;
    }

    if ($found === $start) {
        // Already on the delimiter, nothing to copy.
        return '';
    }

    $this->char = $this->doc[$found];
    $this->pos = $found;

    return substr($this->doc, $start, $found - $start);
}
||
| 964 | |||
/**
 * Replace every match of $pattern in the document with a placeholder key
 * and remember the removed text in $this->noise.
 *
 * Placeholders have the form "___noise___" followed by a number that is
 * space-padded to five characters. Afterwards the document size and the
 * current character are refreshed.
 *
 * @param string $pattern    Regular expression applied to the document.
 * @param bool   $remove_tag True to replace the entire match, false to
 *                           replace only the first capture group.
 *
 * @return void
 */
protected function remove_noise($pattern, $remove_tag = false)
{
    $total = preg_match_all(
        $pattern,
        $this->doc,
        $found,
        PREG_SET_ORDER | PREG_OFFSET_CAPTURE
    );

    // 0 = entire match, 1 = submatch
    $group = ($remove_tag) ? 0 : 1;

    // Walk the matches back to front so that the offsets captured for the
    // earlier matches stay valid while the document is being rewritten.
    for ($n = $total - 1; $n >= 0; --$n) {
        $placeholder = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

        $captured = $found[$n][$group][0];
        $offset = $found[$n][$group][1];

        $this->noise[$placeholder] = $captured;
        $this->doc = substr_replace($this->doc, $placeholder, $offset, strlen($captured));
    }

    // reset the length of content
    $this->size = strlen($this->doc);

    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
||
| 989 | |||
/**
 * Expand noise placeholders in $text back to the content they replaced.
 *
 * A placeholder is the marker "___noise___" followed by five characters
 * (16 characters in total). Each restored entry is removed from
 * $this->noise. Placeholders that cannot be resolved are replaced by a
 * diagnostic string and reported through Debug::log_once().
 *
 * @param string $text Text that may contain noise placeholders.
 *
 * @return string The text with placeholders expanded.
 */
public function restore_noise($text)
{
    if (empty($this->noise)) {
        return $text; // nothing to restore
    }

    $pos = 0;

    while (false !== ($pos = strpos($text, '___noise___', $pos))) {
        // Sometimes there is a broken piece of markup, and we don't get the
        // five characters after the marker, which indicates a problem
        // outside of this method.
        if (strlen($text) > $pos + 15) {
            // Marker (11 characters) plus the five characters that follow it.
            $key = substr($text, $pos, 16);

            if (isset($this->noise[$key])) {
                $text = substr($text, 0, $pos)
                    . $this->noise[$key]
                    . substr($text, $pos + 16);

                unset($this->noise[$key]);
                // $pos is left unchanged on purpose: a marker inside the
                // restored text is handled by the next iteration.
            } else {
                Debug::log_once('Noise restoration failed. DOM has been corrupted!');

                $replacement = 'UNDEFINED NOISE FOR KEY: ' . $key;

                $text = substr($text, 0, $pos)
                    . $replacement
                    . substr($text, $pos + 16);

                // The replacement contains the marker again (it is part of
                // $key). Continue searching behind it; otherwise the same
                // unknown key is found over and over and the loop never ends.
                $pos += strlen($replacement);
            }
        } else {
            // There is no valid key being given back to us... We must get
            // rid of the ___noise___ or we will have a problem.
            Debug::log_once('Noise restoration failed. The provided key is incomplete: ' . $text);
            $text = substr($text, 0, $pos)
                . 'NO NUMERIC NOISE KEY'
                . substr($text, $pos + 11);
        }
    }

    return $text;
}
||
| 1038 | |||
/**
 * Find the first stored noise element that contains $text.
 *
 * @param string $text Substring to look for.
 *
 * @return string|null The matching noise element, or null when none matches.
 */
public function search_noise($text)
{
    foreach ($this->noise as $stored) {
        if (false === strpos($stored, $text)) {
            continue;
        }

        return $stored;
    }

    return null;
}
||
| 1047 | |||
/**
 * String representation of the document: the inner text of the root node.
 *
 * @return string
 */
public function __toString()
{
    $markup = $this->root->innertext();

    return $markup;
}
||
| 1052 | |||
/**
 * Magic accessor for virtual document properties.
 *
 * Supported names: "outertext" and "innertext" (both yield the root node's
 * inner text), "plaintext", "charset" and "target_charset".
 *
 * @param string $name Property name.
 *
 * @return mixed The property value, or null for any other name.
 */
public function __get($name)
{
    switch ($name) {
        case 'outertext':
        case 'innertext':
            // Both names map to the inner text of the root node.
            return $this->root->innertext();
        case 'plaintext':
            return $this->root->text();
        case 'charset':
            return $this->_charset;
        case 'target_charset':
            return $this->_target_charset;
    }

    return null;
}
||
| 1068 | |||
/**
 * Delegate to the root node's childNodes().
 *
 * @param int $idx Index forwarded unchanged to the root node (default -1).
 *
 * @return mixed Whatever the root node's childNodes() returns.
 */
public function childNodes($idx = -1)
{
    $root = $this->root;

    return $root->childNodes($idx);
}
||
| 1073 | |||
| 1074 | public function firstChild() |
||
| 1077 | } |
||
| 1078 | |||
/**
 * Delegate to the root node's lastChild().
 *
 * @return mixed Whatever the root node's lastChild() returns.
 */
public function lastChild()
{
    $root = $this->root;

    return $root->lastChild();
}
||
| 1083 | |||
/**
 * Build a new element node with the given tag name.
 *
 * The node is created with a null document reference, its begin and end
 * info entries are both set to 1, and $value (when not null) becomes its
 * inner content.
 *
 * @param string      $name  Tag name of the element.
 * @param string|null $value Optional inner content.
 *
 * @return HtmlNode The new element node.
 */
public function createElement($name, $value = null)
{
    $element = new HtmlNode(null);

    $element->nodetype = HtmlNode::HDOM_TYPE_ELEMENT;
    $element->_[HtmlNode::HDOM_INFO_BEGIN] = 1;
    $element->_[HtmlNode::HDOM_INFO_END] = 1;

    $has_content = (null !== $value);
    if ($has_content) {
        $element->_[HtmlNode::HDOM_INFO_INNER] = $value;
    }

    $element->tag = $name;

    return $element;
}
||
| 1099 | |||
/**
 * Build a new text node that belongs to this document.
 *
 * @param string|null $value Text of the node; nothing is stored when null.
 *
 * @return HtmlNode The new text node.
 */
public function createTextNode($value)
{
    $text_node = new HtmlNode($this);
    $text_node->nodetype = HtmlNode::HDOM_TYPE_TEXT;

    $has_text = (null !== $value);
    if ($has_text) {
        $text_node->_[HtmlNode::HDOM_INFO_TEXT] = $value;
    }

    return $text_node;
}
||
| 1111 | |||
/**
 * Look up the first element matching the selector "#<id>".
 *
 * @param string $id Element id, without the leading "#".
 *
 * @return mixed Result of find() for index 0.
 */
public function getElementById($id)
{
    $selector = '#' . $id;

    return $this->find($selector, 0);
}
||
| 1116 | |||
| 1117 | public function getElementsById($id, $idx = null) |
||
| 1120 | } |
||
| 1121 | |||
/**
 * Look up the first element matching the given tag name.
 *
 * @param string $name Tag name, passed to find() as the selector.
 *
 * @return mixed Result of find() for index 0.
 */
public function getElementByTagName($name)
{
    $first = $this->find($name, 0);

    return $first;
}
||
| 1126 | |||
| 1127 | public function getElementsByTagName($name, $idx = null) |
||
| 1130 | } |
||
| 1131 | |||
/**
 * Read a file and load its contents into this document.
 *
 * All arguments given to this method are forwarded unchanged to
 * file_get_contents().
 *
 * @param string $file Path or URL handed to file_get_contents().
 *
 * @return false|null False when the file could not be read; otherwise
 *                    nothing is returned.
 */
public function loadFile($file)
{
    $arguments = func_get_args();
    $contents = call_user_func_array('file_get_contents', $arguments);

    if (false === $contents) {
        return false;
    }

    $this->load($contents, true);
}
||
| 1142 | } |
||
| 1143 |