| Total Complexity | 244 | 
| Total Lines | 948 | 
| Duplicated Lines | 0 % | 
| Changes | 0 | ||
Complex classes like simple_html_dom_node often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use simple_html_dom_node, and based on these observations, apply Extract Interface, too.
| 1 | <?php | ||
| 116 | class simple_html_dom_node | ||
| 117 | { | ||
| 118 | public $nodetype = HDOM_TYPE_TEXT; | ||
| 119 | public $tag = 'text'; | ||
| 120 | public $attr = []; | ||
| 121 | public $children = []; | ||
| 122 | public $nodes = []; | ||
| 123 | public $parent = null; | ||
| 124 | // The "info" array - see HDOM_INFO_... for what each element contains. | ||
| 125 | public $_ = []; | ||
| 126 | public $tag_start = 0; | ||
| 127 | private $dom = null; | ||
| 128 | |||
/**
 * Create a node that belongs to the given DOM.
 *
 * The node keeps a back-reference to the DOM and registers itself by
 * appending to the DOM's flat `nodes` list.
 *
 * @param object $dom Owning DOM object; its `nodes` property receives this node.
 */
public function __construct($dom)
{
    $dom->nodes[] = $this;
    $this->dom = $dom;
}
| 134 | |||
/**
 * Release the node's references when the object is destroyed.
 *
 * Delegates to clear().
 */
public function __destruct()
{
    $this->clear();
}
| 139 | |||
/**
 * String conversion: a node renders as its outer text.
 *
 * @return string Whatever outertext() returns for this node.
 */
public function __toString()
{
    $markup = $this->outertext();

    return $markup;
}
| 144 | |||
/**
 * Drop every reference this node holds to the DOM and to other nodes.
 *
 * Exists to work around the PHP 5 memory leak with circular references.
 * Note that the child and node lists become null (not empty arrays).
 */
public function clear()
{
    $this->children = null;
    $this->parent = null;
    $this->nodes = null;
    $this->dom = null;
}
| 153 | |||
/**
 * Echo this node and, recursively, everything in its `nodes` list.
 *
 * Each node is printed on its own line as its tag name, prefixed by one
 * indentation unit per nesting level and optionally followed by its
 * attributes in the form `([name]=>"value", )`.
 *
 * @param bool $show_attr Whether to print the attribute list.
 * @param int  $deep      Current nesting depth (number of indentation units).
 */
public function dump($show_attr=true, $deep=0)
{
    echo str_repeat('	', $deep), $this->tag;

    if ($show_attr && count($this->attr) > 0) {
        echo '(';
        foreach (array_keys($this->attr) as $name) {
            // Values are read through property access, i.e. the same path
            // callers use for $node->name.
            echo '[' . $name . ']=>"' . $this->$name . '", ';
        }
        echo ')';
    }
    echo "\n";

    if (!$this->nodes) {
        return;
    }
    foreach ($this->nodes as $child) {
        $child->dump($show_attr, $deep + 1);
    }
}
| 175 | |||
| 176 | |||
/**
 * Debugging aid: describe this single node (no recursion).
 *
 * The description contains the tag, the attributes, the contents of the
 * `$_` info array, the `text` attribute when present, the stored inner
 * text, the child/node counts and the tag start offset.
 *
 * @param bool $echo When true the description is echoed and nothing is
 *                   returned; when false it is returned as a string.
 * @return string|null The description when $echo is false, otherwise null.
 */
public function dump_node($echo=true)
{
    $string = $this->tag;
    if (count($this->attr)>0) {
        $string .= '(';
        foreach ($this->attr as $k=>$v) {
            $string .= "[$k]=>\"".$this->$k.'", ';
        }
        $string .= ')';
    }
    if (count($this->_)>0) {
        $string .= ' $_ (';
        foreach ($this->_ as $k=>$v) {
            if (is_array($v)) {
                $string .= "[$k]=>(";
                foreach ($v as $k2=>$v2) {
                    $string .= "[$k2]=>\"".$v2.'", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"".$v.'", ';
            }
        }
        $string .= ")";
    }

    if (isset($this->text)) {
        $string .= " text: (" . $this->text . ")";
    }

    $string .= " HDOM_INNER_INFO: '";
    // Fixed: this used to read an undefined local ($node), so the stored
    // inner text was never reported.
    if (isset($this->_[HDOM_INFO_INNER])) {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    } else {
        $string .= ' NULL ';
    }

    // clear() sets both lists to null; count() must not be called on null.
    $string .= " children: " . (is_array($this->children) ? count($this->children) : 0);
    $string .= " nodes: " . (is_array($this->nodes) ? count($this->nodes) : 0);
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";

    if ($echo) {
        echo $string;
        return;
    } else {
        return $string;
    }
}
| 227 | |||
/**
 * Get the parent of this node, optionally re-parenting it first.
 *
 * When a node is passed in, this node is removed from the `nodes` and
 * `children` lists of its previous parent (if that parent is a different
 * node) and appended to both lists of the new parent. It is not appended
 * to a list that already contains it, so repeated calls with the same
 * parent do not create duplicates.
 *
 * @param simple_html_dom_node|null $parent New parent, or null to only read.
 * @return simple_html_dom_node|null The (possibly updated) parent.
 */
public function parent($parent=null)
{
    if ($parent !== null) {
        $previous = $this->parent;

        // Detach from the old parent so it no longer lists this node.
        if ($previous !== null && $previous !== $parent) {
            foreach (array('nodes', 'children') as $list) {
                $entries = $previous->$list;
                if (!is_array($entries)) {
                    continue; // list was nulled by clear()
                }
                $position = array_search($this, $entries, true);
                if ($position !== false) {
                    // array_splice re-indexes, keeping the list sequential
                    // for the index-based sibling lookups.
                    array_splice($entries, $position, 1);
                    $previous->$list = $entries;
                }
            }
        }

        $this->parent = $parent;
        if (!is_array($parent->nodes) || !in_array($this, $parent->nodes, true)) {
            $this->parent->nodes[] = $this;
        }
        if (!is_array($parent->children) || !in_array($this, $parent->children, true)) {
            $this->parent->children[] = $this;
        }
    }

    return $this->parent;
}
| 242 | |||
/**
 * Tell whether this node has at least one entry in its `children` list.
 *
 * @return bool
 */
public function has_child()
{
    if (empty($this->children)) {
        return false;
    }

    return true;
}
| 248 | |||
/**
 * Return the children of this node.
 *
 * @param int $idx Index of the wanted child; -1 (the default) returns the
 *                 whole `children` list.
 * @return mixed The full list, the child at $idx, or null when no child
 *               exists at that index.
 */
public function children($idx=-1)
{
    if ($idx !== -1) {
        return isset($this->children[$idx]) ? $this->children[$idx] : null;
    }

    return $this->children;
}
| 260 | |||
/**
 * Return the first entry of the `children` list.
 *
 * @return mixed The first child, or null when the list is empty.
 */
public function first_child()
{
    return (count($this->children) > 0) ? $this->children[0] : null;
}
| 269 | |||
/**
 * Return the last entry of the `children` list.
 *
 * @return mixed The last child, or null when the list is empty.
 */
public function last_child()
{
    $total = count($this->children);

    return ($total > 0) ? $this->children[$total - 1] : null;
}
| 278 | |||
/**
 * Return the node that follows this one in its parent's `children` list.
 *
 * @return simple_html_dom_node|null Null when there is no parent, when this
 *                                   node is the last child, or when it is
 *                                   not found among the parent's children.
 */
public function next_sibling()
{
    $owner = $this->parent;
    if ($owner === null) {
        return null;
    }

    $total = count($owner->children);

    // Locate this node; $position ends at $total when it is not present.
    for ($position = 0; $position < $total; ++$position) {
        if ($owner->children[$position] === $this) {
            break;
        }
    }

    $position++;

    return ($position < $total) ? $owner->children[$position] : null;
}
| 296 | |||
/**
 * Return the node that precedes this one in its parent's `children` list.
 *
 * Fixed: when this node was not present in the parent's `children` list
 * the search index ran to the end of the list and the parent's last child
 * was returned. Null is now returned in that case, matching next_sibling().
 *
 * @return simple_html_dom_node|null Null when there is no parent, when this
 *                                   node is the first child, or when it is
 *                                   not found among the parent's children.
 */
public function prev_sibling()
{
    if ($this->parent===null) {
        return null;
    }

    $count = count($this->parent->children);
    for ($idx = 0; $idx < $count; ++$idx) {
        if ($this === $this->parent->children[$idx]) {
            return ($idx > 0) ? $this->parent->children[$idx - 1] : null;
        }
    }

    // Not listed among the parent's children: there is no previous sibling.
    return null;
}
| 313 | |||
/**
 * Walk from this node towards the root and return the first node whose
 * tag equals $tag. The node itself is included in the search.
 *
 * @param string $tag Tag name to look for (compared with ==).
 * @return simple_html_dom_node|null The matching node, or null when the
 *                                   chain of parents ends without a match.
 */
public function find_ancestor_tag($tag)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    for ($candidate = $this; !is_null($candidate); $candidate = $candidate->parent) {
        if (is_object($debug_object)) {
            $debug_object->debugLog(2, "Current tag is: " . $candidate->tag);
        }

        if ($candidate->tag == $tag) {
            return $candidate;
        }
    }

    return null;
}
| 337 | |||
/**
 * Get the node's inner HTML.
 *
 * Resolution order: an explicitly stored inner text, then the stored text
 * of the node (passed through the DOM's restore_noise()), and finally the
 * concatenated outer text of every entry in `nodes`.
 *
 * @return string
 */
public function innertext()
{
    $info = $this->_;

    if (isset($info[HDOM_INFO_INNER])) {
        return $info[HDOM_INFO_INNER];
    }
    if (isset($info[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($info[HDOM_INFO_TEXT]);
    }

    $html = '';
    foreach ($this->nodes as $child) {
        $html .= $child->outertext();
    }

    return $html;
}
| 354 | |||
/**
 * Get the node's outer text, i.e. its markup including its own tag.
 *
 * For the root node this is just the inner text. Otherwise the DOM's
 * callback (if any) is invoked with this node first, and the result is, in
 * order of precedence: a stored outer text, the stored node text, or the
 * begin tag + inner content + end tag assembled here.
 *
 * @return string
 */
public function outertext()
{
    global $debug_object;
    if (is_object($debug_object)) {
        $suffix = '';
        if ($this->tag == 'text' && !empty($this->text)) {
            $suffix = " with text: " . $this->text;
        }
        $debug_object->debugLog(1, 'Innertext of tag: ' . $this->tag . $suffix);
    }

    if ($this->tag === 'root') {
        return $this->innertext();
    }

    // Give the user callback a chance to act on the node before rendering;
    // the info array is therefore only consulted after this point.
    if ($this->dom && $this->dom->callback !== null) {
        call_user_func_array($this->dom->callback, array($this));
    }

    if (isset($this->_[HDOM_INFO_OUTER])) {
        return $this->_[HDOM_INFO_OUTER];
    }
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    // Begin tag.
    $html = "";
    if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
        $html = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    }

    // Inner content.
    if (!isset($this->_[HDOM_INFO_INNER])) {
        if ($this->nodes) {
            foreach ($this->nodes as $child) {
                $html .= $this->convert_text($child->outertext());
            }
        }
    } elseif ($this->tag != "br") {
        // A stored inner text is never emitted for br tags.
        $html .= $this->_[HDOM_INFO_INNER];
    }

    // End tag.
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
        $html .= '</' . $this->tag . '>';
    }

    return $html;
}
| 412 | |||
/**
 * Get the node's plain text.
 *
 * A stored inner text wins. Text nodes return their stored text (through
 * the DOM's restore_noise()); comment and unknown nodes, as well as script
 * and style tags, yield an empty string. Everything else is the
 * concatenation of the plain text of the entries in `nodes`; for span tags
 * the DOM's default_span_text is appended.
 *
 * @return string
 */
public function text()
{
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    $type = $this->nodetype;
    if ($type == HDOM_TYPE_TEXT) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }
    if ($type == HDOM_TYPE_COMMENT || $type == HDOM_TYPE_UNKNOWN) {
        return '';
    }

    if (strcasecmp($this->tag, 'script') === 0 || strcasecmp($this->tag, 'style') === 0) {
        return '';
    }

    // `nodes` can be null (clear() sets it to null, for one); in that case
    // there is nothing to collect and nothing is appended for spans either.
    if (is_null($this->nodes)) {
        return '';
    }

    $plain = '';
    foreach ($this->nodes as $child) {
        $plain .= $this->convert_text($child->text());
    }

    // Keep consecutive spans from running into each other in plain text.
    if ($this->tag == "span") {
        $plain .= $this->dom->default_span_text;
    }

    return $plain;
}
| 447 | |||
/**
 * Get the inner text with CDATA markers removed.
 *
 * The opening marker is removed case-insensitively, the closing `]]>`
 * case-sensitively.
 *
 * @return string
 */
public function xmltext()
{
    $withoutOpeners = str_ireplace('<![CDATA[', '', $this->innertext());

    return str_replace(']]>', '', $withoutOpeners);
}
| 455 | |||
/**
 * Build the text of this node's opening tag, including its attributes.
 *
 * Nodes that carry stored text (text, comment, unknown) return that text
 * instead. Attributes whose value is null or false are skipped; a value of
 * true produces a bare attribute name. Spacing and quoting come from the
 * per-attribute entries of the info array, indexed by attribute position.
 *
 * @return string
 */
public function makeup()
{
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $html = '<' . $this->tag;
    $position = 0;

    foreach ($this->attr as $name => $value) {
        // The position counts every attribute, including skipped ones.
        $i = $position++;

        if ($value === null || $value === false) {
            continue;
        }

        $html .= $this->_[HDOM_INFO_SPACE][$i][0];

        if ($value === true) {
            // Attribute without a value: nowrap, checked, selected...
            $html .= $name;
            continue;
        }

        $style = $this->_[HDOM_INFO_QUOTE][$i];
        if ($style == HDOM_QUOTE_DOUBLE) {
            $quote = '"';
        } elseif ($style == HDOM_QUOTE_SINGLE) {
            $quote = '\'';
        } else {
            $quote = '';
        }

        $html .= $name . $this->_[HDOM_INFO_SPACE][$i][1] . '=' . $this->_[HDOM_INFO_SPACE][$i][2] . $quote . $value . $quote;
    }

    return $this->dom->restore_noise($html) . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
| 491 | |||
/**
 * Find elements below this node by CSS selector.
 *
 * The selector string is parsed into comma-separated groups; each group is
 * a chain of descendant steps that is resolved iteratively (no recursion)
 * through seek(). Matches of all groups are merged and ordered by their
 * key in the DOM's node list.
 *
 * @param string   $selector  Selector string.
 * @param int|null $idx       Null returns every match; otherwise the match
 *                            at that position (negative counts from the end).
 * @param bool     $lowercase Passed on to seek() for case-insensitive
 *                            comparison of selector values.
 * @return mixed Array of nodes when $idx is null (or nothing can match),
 *               otherwise a single node or null.
 */
public function find($selector, $idx=null, $lowercase=false)
{
    $groups = $this->parse_selector($selector);
    if (count($groups) === 0) {
        return [];
    }

    $matched = [];

    foreach ($groups as $chain) {
        if (count($chain) === 0) {
            return [];
        }
        if (!isset($this->_[HDOM_INFO_BEGIN])) {
            return [];
        }

        // Keys of the nodes the next step starts from.
        $frontier = array($this->_[HDOM_INFO_BEGIN] => 1);

        foreach ($chain as $step) {
            $next = [];
            foreach ($frontier as $key => $unused) {
                $start = ($key === -1) ? $this->dom->root : $this->dom->nodes[$key];
                $start->seek($step, $next, $lowercase);
            }
            $frontier = $next;
        }

        // Union keeps keys that were already recorded by earlier groups.
        $matched += $frontier;
    }

    ksort($matched);

    $found = [];
    foreach (array_keys($matched) as $key) {
        $found[] = $this->dom->nodes[$key];
    }

    if (is_null($idx)) {
        return $found;
    }
    if ($idx < 0) {
        $idx = count($found) + $idx;
    }

    return isset($found[$idx]) ? $found[$idx] : null;
}
| 549 | |||
/**
 * Collect the nodes below this one that satisfy a single selector step.
 *
 * Matches are reported through $ret, keyed by the node's index in the
 * DOM's node list with the value 1.
 *
 * Fixes over the previous version:
 *  - the walk up the parent chain checked for a null parent only after
 *    using it, and then read the end index from a null parent; the walk is
 *    now guarded and a missing ancestor simply yields no candidates;
 *  - a value comparison on an attribute that is absent (possible with
 *    `[!attr=value]`) no longer reads an undefined array index.
 *
 * @param array $selector  One parsed step: tag, key, value, operator, no_key.
 * @param array $ret       Result set, filled by reference.
 * @param bool  $lowercase Compare selector values case-insensitively.
 */
protected function seek($selector, &$ret, $lowercase=false)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index: tag[n] selects the n-th direct child with that tag
    if ($tag && $key && is_numeric($key)) {
        $count = 0;
        foreach ($this->children as $c) {
            if ($tag==='*' || $tag===$c->tag) {
                if (++$count==$key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    // Determine where this node's range in the flat node list ends. Without
    // an end index of its own, borrow the nearest ancestor's, moved back by
    // one for every ancestor level that had none.
    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end==0) {
        $parent = $this->parent;
        while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        if ($parent!==null) {
            $end += $parent->_[HDOM_INFO_END];
        }
    }

    for ($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        // A bare "*" only matches direct children.
        if ($tag==='*' && !$key) {
            if (in_array($node, $this->children, true)) {
                $ret[$i] = 1;
            }
            continue;
        }

        // compare tag
        if ($tag && $tag!=$node->tag && $tag!=='*') {
            $pass=false;
        }
        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) {
                    $pass=false;
                }
            } else {
                if (($key != "plaintext") && !isset($node->attr[$key])) {
                    $pass=false;
                }
            }
        }
        // compare value
        if ($pass && $key && $val  && $val!=='*') {
            if ($key == "plaintext") {
                // A "plaintext" search compares against the node's plain text.
                $nodeKeyValue = $node->text();
            } else {
                // Normal search: the value of that attribute, or an empty
                // string when the node does not carry it.
                $nodeKeyValue = isset($node->attr[$key]) ? $node->attr[$key] : '';
            }
            if (is_object($debug_object)) {
                $debug_object->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);
            }

            // If lowercase is set, do a case insensitive test of the value.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($debug_object)) {
                $debug_object->debugLog(2, "after match: " . ($check ? "true" : "false"));
            }

            // handle multiple class: test each space-separated class name
            if (!$check && strcasecmp($key, 'class')===0) {
                foreach (explode(' ', $nodeKeyValue) as $k) {
                    // Leading, trailing or double spaces produce empty
                    // pieces; those must not be compared.
                    if (!empty($k)) {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) {
                            break;
                        }
                    }
                }
            }
            if (!$check) {
                $pass = false;
            }
        }
        if ($pass) {
            $ret[$i] = 1;
        }
        unset($node);
    }
    // $ret is passed by reference, so it is effectively the return value.
    if (is_object($debug_object)) {
        $debug_object->debugLog(1, "EXIT - ret: ", $ret);
    }
}
| 667 | |||
/**
 * Test a value against a selector pattern using the given operator.
 *
 * Supported operators: `=` and `!=` (strict identity), `^=` (starts with),
 * `$=` (ends with) and `*=` (regular expression; a pattern that already
 * starts with `/` is used as-is, anything else is wrapped as a
 * case-insensitive expression). Any other operator yields false.
 *
 * @param string $exp     Operator.
 * @param string $pattern Value taken from the selector.
 * @param string $value   Value taken from the node.
 * @return bool|int Boolean for `=`/`!=` and unknown operators, otherwise
 *                  the raw preg_match() result.
 */
protected function match($exp, $pattern, $value)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    if ($exp == '=') {
        return ($value === $pattern);
    }
    if ($exp == '!=') {
        return ($value !== $pattern);
    }
    if ($exp == '^=') {
        return preg_match("/^".preg_quote($pattern, '/')."/", $value);
    }
    if ($exp == '$=') {
        return preg_match("/".preg_quote($pattern, '/')."$/", $value);
    }
    if ($exp == '*=') {
        $regex = ($pattern[0] == '/') ? $pattern : "/".$pattern."/i";

        return preg_match($regex, $value);
    }

    return false;
}
| 692 | |||
/**
 * Parse a CSS-like selector string.
 *
 * Returns a list of selector groups (one per comma-separated alternative);
 * each group is a list of steps, and each step is the array
 * `[tag, key, value, operator, no_key]`.
 *
 * Fixes over the previous version:
 *  - the character classes `[\w-:\*]` and `[\w-:]` placed a hyphen between
 *    `\w` and another character, which newer PCRE versions reject as an
 *    invalid range; the hyphen now sits at the end of each class, which
 *    describes the same set of characters;
 *  - the key is cast to string before lower-casing so a selector without
 *    a key does not pass null to strtolower().
 *
 * @param string $selector_string Selector to parse.
 * @return array List of groups of parsed steps.
 */
protected function parse_selector($selector_string)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    // Pattern of CSS selectors, modified from mootools.
    // The colon is allowed in tag and attribute names so that
    // <tag attr:ibute="something"> can be matched. Such attributes must be
    // read with getAttribute, since $dom->x:y is not valid PHP syntax.
    // The "@?" after "\[" lets an attribute specifier start with an @ sign
    // that is not captured. Further study is required to determine whether
    // this should be documented or removed.
    $pattern = "/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
    if (is_object($debug_object)) {
        $debug_object->debugLog(2, "Matches Array: ", $matches);
    }

    $selectors = [];
    $result = [];

    foreach ($matches as $m) {
        $m[0] = trim($m[0]);
        if ($m[0]==='' || $m[0]==='/' || $m[0]==='//') {
            continue;
        }
        // for browser generated xpath
        if ($m[1]==='tbody') {
            continue;
        }

        list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
        if (!empty($m[2])) {
            $key='id';
            $val=$m[2];
        }
        if (!empty($m[3])) {
            $key='class';
            $val=$m[3];
        }
        if (!empty($m[4])) {
            $key=$m[4];
        }
        if (!empty($m[5])) {
            $exp=$m[5];
        }
        if (!empty($m[6])) {
            $val=$m[6];
        }

        // convert to lowercase
        if ($this->dom->lowercase) {
            $tag=strtolower($tag);
            $key=strtolower((string) $key);
        }
        // elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0]==='!') {
            $key=substr($key, 1);
            $no_key=true;
        }

        $result[] = array($tag, $key, $val, $exp, $no_key);
        // A comma closes the current group of steps.
        if (trim($m[7])===',') {
            $selectors[] = $result;
            $result = [];
        }
    }
    if (count($result)>0) {
        $selectors[] = $result;
    }
    return $selectors;
}
| 768 | |||
/**
 * Magic read access.
 *
 * An attribute with a non-null value is returned (charset-converted).
 * Otherwise the virtual properties outertext, innertext, plaintext and
 * xmltext map to the corresponding methods, and any other name yields a
 * boolean telling whether an attribute key of that name exists.
 *
 * @param string $name Property or attribute name.
 * @return mixed
 */
public function __get($name)
{
    if (isset($this->attr[$name])) {
        return $this->convert_text($this->attr[$name]);
    }

    if ($name == 'outertext') {
        return $this->outertext();
    }
    if ($name == 'innertext') {
        return $this->innertext();
    }
    if ($name == 'plaintext') {
        return $this->text();
    }
    if ($name == 'xmltext') {
        return $this->xmltext();
    }

    return array_key_exists($name, $this->attr);
}
| 782 | |||
/**
 * Magic write access.
 *
 * `outertext` and `innertext` are stored in the info array (innertext
 * overwrites the stored node text when the node has one). Every other name
 * is stored as an attribute; a new attribute also gets default spacing and
 * double-quote entries in the info array.
 *
 * @param string $name  Property or attribute name.
 * @param mixed  $value Value to store.
 * @return mixed The stored value for outertext/innertext, otherwise nothing.
 */
public function __set($name, $value)
{
    if ($name == 'outertext') {
        return $this->_[HDOM_INFO_OUTER] = $value;
    }
    if ($name == 'innertext') {
        $slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;

        return $this->_[$slot] = $value;
    }

    $isNewAttribute = !isset($this->attr[$name]);
    if ($isNewAttribute) {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}
| 799 | |||
/**
 * Magic isset().
 *
 * The virtual properties outertext, innertext and plaintext always count
 * as set. For anything else the attribute key merely has to exist, so
 * value-less attributes (nowrap, checked, selected...) count as set too.
 *
 * @param string $name Property or attribute name.
 * @return bool
 */
public function __isset($name)
{
    if (in_array($name, array('outertext', 'innertext', 'plaintext'))) {
        return true;
    }

    return array_key_exists($name, $this->attr);
}
| 810 | |||
/**
 * Magic unset(): remove an attribute.
 *
 * Only attributes that are set to a non-null value are removed.
 *
 * @param string $name Attribute name.
 */
public function __unset($name)
{
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}
| 817 | |||
/**
 * Convert text from the DOM's source charset to its target charset when
 * the two differ, and strip a UTF-8 byte order mark from either end when
 * the target is UTF-8.
 *
 * Without a DOM, or when either charset is empty or both are equal, the
 * text is returned unconverted.
 *
 * Fixed: a failed iconv() call returns false, which used to replace the
 * text and made callers silently lose content. The original text is now
 * kept when the conversion fails. The charsets are also cast to string
 * before upper-casing so unset values are not passed to strtoupper().
 *
 * @param string $text Text to convert.
 * @return string Converted text, or the input when no conversion applies
 *                or the conversion fails.
 */
public function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    $converted_text = $text;

    $sourceCharset = "";
    $targetCharset = "";

    if ($this->dom) {
        $sourceCharset = strtoupper((string) $this->dom->_charset);
        $targetCharset = strtoupper((string) $this->dom->_target_charset);
    }
    if (is_object($debug_object)) {
        $debug_object->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);
    }

    if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
        // The reported encoding could be wrong and the text already UTF-8.
        if ((strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text))) {
            $converted_text = $text;
        } else {
            $result = iconv($sourceCharset, $targetCharset, $text);
            if ($result !== false) {
                $converted_text = $result;
            }
        }
    }

    // Make sure no BOM ends up in any UTF-8 text we output.
    if ($targetCharset == 'UTF-8') {
        if (substr($converted_text, 0, 3) == "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 3);
        }
        if (substr($converted_text, -3) == "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
| 860 | |||
/**
 * Returns true if $str is structurally valid UTF-8 and false otherwise.
 *
 * Every lead byte must be followed by the right number of continuation
 * bytes (0x80-0xBF). Lead bytes for the legacy 5- and 6-byte forms are
 * still accepted, and overlong encodings are not rejected.
 *
 * Fixed: the ASCII test used `> 128`, so the byte 0x80 (a continuation
 * byte that can never start a sequence) was treated as plain ASCII.
 *
 * @param mixed $str String to be tested
 * @return boolean
 */
public static function is_utf8($str)
{
    $c=0;
    $b=0;
    $bits=0;
    $len=strlen($str);
    for ($i=0; $i<$len; $i++) {
        $c=ord($str[$i]);
        if ($c >= 128) {
            // Lead byte: derive the total sequence length from its value.
            if (($c >= 254)) {
                return false;
            } elseif ($c >= 252) {
                $bits=6;
            } elseif ($c >= 248) {
                $bits=5;
            } elseif ($c >= 240) {
                $bits=4;
            } elseif ($c >= 224) {
                $bits=3;
            } elseif ($c >= 192) {
                $bits=2;
            } else {
                // 0x80-0xBF: continuation byte without a lead byte.
                return false;
            }
            // The sequence must not run past the end of the string.
            if (($i+$bits) > $len) {
                return false;
            }
            while ($bits > 1) {
                $i++;
                $b=ord($str[$i]);
                if ($b < 128 || $b > 191) {
                    return false;
                }
                $bits--;
            }
        }
    }
    return true;
}
| 906 | /* | ||
| 907 | function is_utf8($string) | ||
| 908 |     { | ||
| 909 | //this is buggy | ||
| 910 | return (utf8_encode(utf8_decode($string)) == $string); | ||
| 911 | } | ||
| 912 | */ | ||
| 913 | |||
/**
 * Try a few tricks to determine the displayed size of an img on the page.
 * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
 *
 * Lookup order for each dimension:
 *   1. the tag's own width / height attribute (returned as stored);
 *   2. a "width: <int>px" / "height: <int>px" declaration in the inline style
 *      attribute (returned as the numeric string without the "px" suffix).
 * A dimension that cannot be determined is reported as -1.
 *
 * @author John Schlick
 * @version April 19 2012
 * @return array|false array with keys 'height' and 'width' (in that order), or false when this node is not an img tag.
 */
public function get_display_size()
{
    if ($this->tag !== 'img') {
        return false;
    }

    // -1 marks "unknown". Key order (height, width) matches the historical return value.
    $size = ['height' => -1, 'width' => -1];
    $dimensions = ['width', 'height'];

    // See if there is a height or width attribute in the tag itself.
    foreach ($dimensions as $dimension) {
        if (isset($this->attr[$dimension])) {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style. Only a string can hold declarations to parse.
    if (isset($this->attr['style']) && is_string($this->attr['style'])) {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declarations = [];
        preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            // CSS property names are case-insensitive; the value group may carry
            // trailing whitespace because [^;]+ is greedy, so trim it here.
            $declarations[strtolower($match[1])] = trim($match[2]);
        }

        foreach ($dimensions as $dimension) {
            // The style only fills in a dimension the tag attributes left unknown.
            if (!isset($declarations[$dimension]) || $size[$dimension] != -1) {
                continue;
            }

            // Only pixel lengths are understood.
            $value = $declarations[$dimension];
            if (strtolower(substr($value, -2)) !== 'px') {
                continue;
            }

            // Make sure that it's an integer and not something stupid.
            // filter_var() returns the parsed int, so compare against false
            // explicitly: a legitimate "0px" would otherwise be rejected.
            $pixels = trim(substr($value, 0, -2));
            if (filter_var($pixels, FILTER_VALIDATE_INT) !== false) {
                $size[$dimension] = $pixels;
            }
        }
    }

    // Future enhancement:
    // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.

    // Far future enhancement
    // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
    // Note that in this case, the class or id will have the img subselector for it to apply to the image.

    // ridiculously far future development
    // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.

    return $size;
}
| 990 | |||
| 991 | // camel naming conventions | ||
/**
 * camelCase alias: hands back this node's attribute map ($this->attr) as stored.
 *
 * @return mixed the current value of $this->attr
 */
public function getAllAttributes()
{
    $attributes = $this->attr;

    return $attributes;
}
/**
 * camelCase alias: reads one attribute by delegating to __get().
 *
 * @param string $name attribute name, passed through unchanged
 * @return mixed whatever __get($name) yields
 */
public function getAttribute($name)
{
    $value = $this->__get($name);

    return $value;
}
| 1000 | public function setAttribute($name, $value) | ||
| 1003 | } | ||
/**
 * camelCase alias: checks for an attribute by delegating to __isset().
 *
 * @param string $name attribute name, passed through unchanged
 * @return mixed the result of __isset($name)
 */
public function hasAttribute($name)
{
    $isPresent = $this->__isset($name);

    return $isPresent;
}
/**
 * camelCase alias: removes an attribute by assigning null to it through __set().
 *
 * @param string $name attribute name, passed through unchanged
 * @return void
 */
public function removeAttribute($name)
{
    // Removal is expressed as "set to null"; __set() decides what that means.
    $this->__set($name, null);
}
/**
 * camelCase alias: looks up the first match of the id selector "#<id>" via find().
 *
 * @param string $id element id, without the leading '#'
 * @return mixed whatever find() returns for index 0
 */
public function getElementById($id)
{
    $selector = '#' . $id;

    return $this->find($selector, 0);
}
/**
 * camelCase alias: runs find() with the id selector "#<id>".
 *
 * @param string $id element id, without the leading '#'
 * @param mixed $idx index forwarded to find(); null by default
 * @return mixed whatever find() returns
 */
public function getElementsById($id, $idx=null)
{
    $selector = '#' . $id;

    return $this->find($selector, $idx);
}
/**
 * camelCase alias: asks find() for the first match (index 0) of the given selector.
 *
 * @param string $name tag name, used verbatim as the find() selector
 * @return mixed whatever find() returns for index 0
 */
public function getElementByTagName($name)
{
    $firstMatch = $this->find($name, 0);

    return $firstMatch;
}
/**
 * camelCase alias: forwards the tag name and optional index straight to find().
 *
 * @param string $name tag name, used verbatim as the find() selector
 * @param mixed $idx index forwarded to find(); null by default
 * @return mixed whatever find() returns
 */
public function getElementsByTagName($name, $idx=null)
{
    $matches = $this->find($name, $idx);

    return $matches;
}
/**
 * camelCase alias for parent().
 *
 * @return mixed whatever parent() returns
 */
public function parentNode()
{
    $parentNode = $this->parent();

    return $parentNode;
}
/**
 * camelCase alias for children().
 *
 * @param int $idx index forwarded to children(); -1 by default
 * @return mixed whatever children($idx) returns
 */
public function childNodes($idx=-1)
{
    $result = $this->children($idx);

    return $result;
}
/**
 * camelCase alias for first_child().
 *
 * @return mixed whatever first_child() returns
 */
public function firstChild()
{
    $first = $this->first_child();

    return $first;
}
/**
 * camelCase alias for last_child().
 *
 * @return mixed whatever last_child() returns
 */
public function lastChild()
{
    $last = $this->last_child();

    return $last;
}
| 1044 | public function nextSibling() | ||
| 1047 | } | ||
/**
 * camelCase alias for prev_sibling().
 *
 * @return mixed whatever prev_sibling() returns
 */
public function previousSibling()
{
    $previous = $this->prev_sibling();

    return $previous;
}
/**
 * camelCase alias for has_child().
 *
 * @return mixed whatever has_child() returns
 */
public function hasChildNodes()
{
    $hasChildren = $this->has_child();

    return $hasChildren;
}
/**
 * camelCase accessor: returns this node's tag name as stored in $this->tag.
 *
 * @return mixed the current value of $this->tag
 */
public function nodeName()
{
    $tagName = $this->tag;

    return $tagName;
}
| 1060 | public function appendChild($node) | ||
| 1064 | } | ||
| 1065 | } | ||
| 1066 | |||
| 1067 | /** | ||
| 1068 | * simple html dom parser | ||
| 1069 | * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector. | ||
| 1070 | * Paperg - change $size from protected to public so we can easily access it | ||
| 1071 | * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it. | ||
| 1072 | * | ||
| 1073 | * @package PlaceLocalInclude | ||
| 1074 | */ | ||
| 1075 | class simple_html_dom | ||
| 1076 | { | ||
| 1077 | public $root = null; | ||
| 1078 | public $nodes = []; | ||
| 1079 | public $callback = null; | ||
| 1080 | public $lowercase = false; | ||
| 1081 | // Used to keep track of how large the text was when we started. | ||
| 1082 | public $original_size; | ||
| 1083 | public $size; | ||
| 1084 | protected $pos; | ||
| 1085 | protected $doc; | ||
| 1086 | protected $char; | ||
| 1087 | protected $cursor; | ||
| 1088 | protected $parent; | ||
| 1089 | protected $noise = []; | ||
| 1090 | protected $token_blank = " \t\r\n"; | ||
| 1091 | protected $token_equal = ' =/>'; | ||
| 1092 | protected $token_slash = " />\r\n\t"; | ||
| 1093 | protected $token_attr = ' >'; | ||
| 1094 | // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. | ||
| 1095 | public $_charset = ''; | ||
| 1096 | public $_target_charset = ''; | ||
| 1097 | protected $default_br_text = ""; | ||
| 1098 | public $default_span_text = ""; | ||
| 1099 | |||
| 1100 | // use isset instead of in_array, performance boost about 30%... | ||
| 1101 |     protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1); | ||
| 1102 |     protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1); | ||
| 1103 | // Known sourceforge issue #2977341 | ||
| 1871 | 
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.