Total Complexity | 244 |
Total Lines | 993 |
Duplicated Lines | 0 % |
Changes | 0 |
Complex classes like simple_html_dom_node often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use simple_html_dom_node, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
117 | class simple_html_dom_node |
||
118 | { |
||
119 | public $nodetype = HDOM_TYPE_TEXT; |
||
120 | public $tag = 'text'; |
||
121 | public $attr = []; |
||
122 | /** @var simple_html_dom_node[] $children */ |
||
123 | public $children = []; |
||
124 | public $nodes = []; |
||
125 | public $parent = null; |
||
126 | // The "info" array - see HDOM_INFO_... for what each element contains. |
||
127 | public $_ = []; |
||
128 | public $tag_start = 0; |
||
129 | private $dom = null; |
||
130 | |||
/**
 * Attach this node to its owning document.
 *
 * The node keeps a reference to the document and appends itself to the
 * document's public node list.
 *
 * @param simple_html_dom $dom document that owns this node
 */
public function __construct(simple_html_dom $dom)
{
    $dom->nodes[] = $this;
    $this->dom = $dom;
}
||
136 | |||
/**
 * Release this node's references (see clear()) when the object is destroyed.
 */
public function __destruct()
{
    $this->clear();
}
||
141 | |||
/**
 * String conversion yields the node's outer text.
 *
 * @return string result of outertext()
 */
public function __toString()
{
    $rendered = $this->outertext();

    return $rendered;
}
||
146 | |||
/**
 * Drop every reference this node holds to the document and to other nodes.
 *
 * Exists to work around the PHP5 circular-reference memory leak. After this
 * call $dom, $nodes, $parent and $children are all null.
 */
public function clear()
{
    $this->children = null;
    $this->parent = null;
    $this->nodes = null;
    $this->dom = null;
}
||
155 | |||
/**
 * Echo this node and, recursively, everything in $this->nodes as an indented tree.
 *
 * @param bool $show_attr when true, attributes are printed after the tag name
 * @param int  $deep      current depth; one leading space is printed per level
 */
public function dump($show_attr = true, $deep = 0)
{
    echo str_repeat(' ', $deep).$this->tag;

    if ($show_attr && count($this->attr) > 0) {
        $rendered = '(';
        foreach (array_keys($this->attr) as $attrName) {
            // Read through property access so the value is resolved the same
            // way callers see it.
            $rendered .= "[$attrName]=>\"".$this->$attrName.'", ';
        }
        echo $rendered.')';
    }
    echo "\n";

    if (!$this->nodes) {
        return;
    }
    foreach ($this->nodes as $childNode) {
        $childNode->dump($show_attr, $deep + 1);
    }
}
||
177 | |||
/**
 * Debugging helper: describe this single node (tag, attributes, the "info"
 * array, text, inner info, child/node counts and tag_start) on one line.
 *
 * @param bool $echo when true the description is echoed and nothing is
 *                   returned; when false the description is returned
 *
 * @return string|null the description when $echo is false, otherwise null
 */
public function dump_node($echo = true)
{
    $string = $this->tag;
    if (count($this->attr) > 0) {
        $string .= '(';
        foreach ($this->attr as $k => $v) {
            $string .= "[$k]=>\"".$this->$k.'", ';
        }
        $string .= ')';
    }
    if (count($this->_) > 0) {
        $string .= ' $_ (';
        foreach ($this->_ as $k => $v) {
            if (is_array($v)) {
                $string .= "[$k]=>(";
                foreach ($v as $k2 => $v2) {
                    $string .= "[$k2]=>\"".$v2.'", ';
                }
                $string .= ')';
            } else {
                $string .= "[$k]=>\"".$v.'", ';
            }
        }
        $string .= ')';
    }

    if (isset($this->text)) {
        $string .= ' text: ('.$this->text.')';
    }

    $string .= " HDOM_INNER_INFO: '";
    // The original read an undefined $node here, so this branch could never
    // be taken; the inner info lives on this node.
    if (isset($this->_[HDOM_INFO_INNER])) {
        $string .= $this->_[HDOM_INFO_INNER]."'";
    } else {
        $string .= ' NULL ';
    }

    // clear() sets children/nodes to null; count() must not be handed null.
    $string .= ' children: '.(is_array($this->children) ? count($this->children) : 0);
    $string .= ' nodes: '.(is_array($this->nodes) ? count($this->nodes) : 0);
    $string .= ' tag_start: '.$this->tag_start;
    $string .= "\n";

    if ($echo) {
        echo $string;

        return;
    }

    return $string;
}
||
229 | |||
/**
 * Get the parent of this node, optionally re-parenting it first.
 *
 * When a node is passed in, it becomes this node's parent and this node is
 * appended to the new parent's nodes and children lists.
 *
 * NOTE (from the original author): re-parenting is believed not to work
 * properly, because the node is never removed from its previous parent's
 * nodes or children list first.
 *
 * @param simple_html_dom_node|null $parent new parent, or null to only read
 *
 * @return simple_html_dom_node|null the (possibly new) parent
 */
public function parent($parent = null)
{
    if ($parent === null) {
        return $this->parent;
    }

    $this->parent = $parent;
    $this->parent->nodes[] = $this;
    $this->parent->children[] = $this;

    return $this->parent;
}
||
244 | |||
/**
 * Tell whether this node has any children.
 *
 * @return bool true when $this->children is non-empty
 */
public function has_child()
{
    $childless = empty($this->children);

    return !$childless;
}
||
250 | |||
/**
 * Get all children, or a single child by index.
 *
 * @param int $idx -1 (the default) returns the whole children list; any other
 *                 value returns the child stored under that index
 *
 * @return simple_html_dom_node[]|simple_html_dom_node|null null when no child
 *         exists under $idx
 */
public function children($idx = -1)
{
    if ($idx === -1) {
        return $this->children;
    }

    return isset($this->children[$idx]) ? $this->children[$idx] : null;
}
||
261 | |||
/**
 * Get the first child of this node.
 *
 * @return simple_html_dom_node|null null when there are no children
 */
public function first_child()
{
    if (count($this->children) === 0) {
        return null;
    }

    return $this->children[0];
}
||
269 | |||
/**
 * Get the last child of this node.
 *
 * @return simple_html_dom_node|null null when there are no children
 */
public function last_child()
{
    $total = count($this->children);
    if ($total === 0) {
        return null;
    }

    return $this->children[$total - 1];
}
||
277 | |||
278 | // returns the next sibling of node |
||
279 | public function next_sibling() |
||
295 | } |
||
296 | |||
/**
 * Get the sibling that precedes this node in its parent's children list.
 *
 * @return simple_html_dom_node|null null when the node has no parent, is the
 *         first child, or is not present in the parent's children list
 */
public function prev_sibling()
{
    if ($this->parent === null) {
        return;
    }

    $siblings = $this->parent->children;
    // The parent's list is null once clear() has run on it.
    if (!is_array($siblings)) {
        return;
    }
    $siblings = array_values($siblings);

    // Strict search compares object identity. The original linear scan ran
    // off the end when this node was missing and then wrongly returned the
    // parent's last child.
    $position = array_search($this, $siblings, true);
    if ($position === false || $position === 0) {
        return;
    }

    return $siblings[$position - 1];
}
||
314 | |||
/**
 * Walk from this node towards the root and return the first node whose tag
 * equals $tag. The node itself is included in the comparison.
 *
 * @param string $tag tag name to look for
 *
 * @return simple_html_dom_node|null the matching node, or null when the
 *         parent chain ends without a match
 */
public function find_ancestor_tag($tag)
{
    global $debugObject;
    if (is_object($debugObject)) {
        $debugObject->debugLogEntry(1);
    }

    for ($candidate = $this; !is_null($candidate); $candidate = $candidate->parent) {
        if (is_object($debugObject)) {
            $debugObject->debugLog(2, 'Current tag is: '.$candidate->tag);
        }

        if ($candidate->tag == $tag) {
            return $candidate;
        }
    }

    return null;
}
||
339 | |||
/**
 * Get the node's inner HTML.
 *
 * Resolution order: an explicit HDOM_INFO_INNER value, then the stored
 * HDOM_INFO_TEXT (passed through the document's restore_noise()), and
 * finally the concatenated outer text of every entry in $this->nodes.
 *
 * @return string
 */
public function innertext()
{
    $info = $this->_;

    if (isset($info[HDOM_INFO_INNER])) {
        return $info[HDOM_INFO_INNER];
    }
    if (isset($info[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($info[HDOM_INFO_TEXT]);
    }

    $pieces = '';
    foreach ($this->nodes as $inner) {
        $pieces .= $inner->outertext();
    }

    return $pieces;
}
||
357 | |||
/**
 * Get the node's outer text, i.e. its markup including the tag itself.
 *
 * The root node renders as its inner text. For other nodes the document's
 * callback (if any) is invoked first; then an explicit HDOM_INFO_OUTER or
 * HDOM_INFO_TEXT value wins; otherwise the text is assembled from the begin
 * tag, the inner content and the end tag.
 *
 * @return string
 */
public function outertext()
{
    global $debugObject;
    if (is_object($debugObject)) {
        $suffix = '';
        if ($this->tag == 'text' && !empty($this->text)) {
            $suffix = ' with text: '.$this->text;
        }
        $debugObject->debugLog(1, 'Innertext of tag: '.$this->tag.$suffix);
    }

    if ($this->tag === 'root') {
        return $this->innertext();
    }

    // Give the document-level callback a chance to inspect or alter the node.
    if ($this->dom && $this->dom->callback !== null) {
        call_user_func_array($this->dom->callback, [$this]);
    }

    if (isset($this->_[HDOM_INFO_OUTER])) {
        return $this->_[HDOM_INFO_OUTER];
    }
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    // Begin tag.
    $markup = '';
    if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
        $markup = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    }

    // Inner content.
    if (isset($this->_[HDOM_INFO_INNER])) {
        // A br tag never emits the inner info that may have been attached to it.
        if ($this->tag != 'br') {
            $markup .= $this->_[HDOM_INFO_INNER];
        }
    } elseif ($this->nodes) {
        foreach ($this->nodes as $inner) {
            $markup .= $this->convert_text($inner->outertext());
        }
    }

    // End tag.
    $hasEndTag = isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0;
    if ($hasEndTag) {
        $markup .= '</'.$this->tag.'>';
    }

    return $markup;
}
||
416 | |||
/**
 * Get the node's plain text.
 *
 * An explicit HDOM_INFO_INNER value is returned as is. Text nodes return
 * their stored text, comment and unknown nodes return '', and so do script
 * and style elements. Everything else is the concatenation of the plain text
 * of the entries in $this->nodes; span elements additionally get the
 * document's default_span_text appended.
 *
 * @return string
 */
public function text()
{
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    $type = $this->nodetype;
    if ($type == HDOM_TYPE_TEXT) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }
    if ($type == HDOM_TYPE_COMMENT || $type == HDOM_TYPE_UNKNOWN) {
        return '';
    }

    if (strcasecmp($this->tag, 'script') === 0 || strcasecmp($this->tag, 'style') === 0) {
        return '';
    }

    // $this->nodes has been observed to be NULL for some element nodes
    // (spans, paragraphs) without a known cause, hence the guard.
    if (is_null($this->nodes)) {
        return '';
    }

    $plain = '';
    foreach ($this->nodes as $inner) {
        $plain .= $this->convert_text($inner->text());
    }

    // Keep consecutive spans from running into each other in plain text.
    if ($this->tag == 'span') {
        $plain .= $this->dom->default_span_text;
    }

    return $plain;
}
||
452 | |||
/**
 * Get the inner text with CDATA wrappers removed.
 *
 * Every '<![CDATA[' (matched case-insensitively) and every ']]>' is stripped
 * from the result of innertext().
 *
 * @return string
 */
public function xmltext()
{
    $withoutOpeners = str_ireplace('<![CDATA[', '', $this->innertext());

    return str_replace(']]>', '', $withoutOpeners);
}
||
461 | |||
/**
 * Build the text of this node's opening tag, including its attributes.
 *
 * Text, comment and unknown nodes (anything with HDOM_INFO_TEXT) return
 * their stored text instead. Attributes whose value is null or false are
 * skipped, a value of true renders the bare attribute name, and every other
 * value is rendered with the spacing and quote style recorded in the info
 * array at the attribute's position.
 *
 * @return string
 */
public function makeup()
{
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $markup = '<'.$this->tag;
    $position = -1;

    foreach ($this->attr as $name => $value) {
        // Positions are counted for every attribute, removed ones included,
        // because the spacing/quote info is indexed by original position.
        ++$position;

        if ($value === null || $value === false) {
            continue;
        }

        $markup .= $this->_[HDOM_INFO_SPACE][$position][0];

        // Value-less attributes: nowrap, checked, selected...
        if ($value === true) {
            $markup .= $name;
            continue;
        }

        $quoteStyle = $this->_[HDOM_INFO_QUOTE][$position];
        if ($quoteStyle == HDOM_QUOTE_DOUBLE) {
            $quote = '"';
        } elseif ($quoteStyle == HDOM_QUOTE_SINGLE) {
            $quote = '\'';
        } else {
            $quote = '';
        }

        $spacing = $this->_[HDOM_INFO_SPACE][$position];
        $markup .= $name.$spacing[1].'='.$spacing[2].$quote.$value.$quote;
    }

    $markup = $this->dom->restore_noise($markup);

    return $markup.$this->_[HDOM_INFO_ENDSPACE].'>';
}
||
498 | |||
/**
 * Find elements by CSS selector.
 *
 * Each comma-separated selector group is resolved level by level through
 * seek(), without recursion; the matches of all groups are merged and
 * ordered by their node index in the document.
 *
 * @param string   $selector  CSS selector, possibly with several comma-separated groups
 * @param int|null $idx       null returns every match; otherwise the match at
 *                            that position (negative values count from the end)
 * @param bool     $lowercase passed on to seek() for case-insensitive value tests
 *
 * @return simple_html_dom_node[]|simple_html_dom_node|null
 */
public function find($selector, $idx = null, $lowercase = false)
{
    $groups = $this->parse_selector($selector);
    if (count($groups) === 0) {
        return [];
    }

    $matchedKeys = [];

    foreach ($groups as $group) {
        // Sourceforge tracker id 2788009: check the current group, not always the first.
        $depth = count($group);
        if ($depth === 0) {
            return [];
        }
        if (!isset($this->_[HDOM_INFO_BEGIN])) {
            return [];
        }

        $frontier = [$this->_[HDOM_INFO_BEGIN] => 1];

        // Descendant selectors are handled iteratively, one level per pass.
        for ($level = 0; $level < $depth; $level++) {
            $next = [];
            foreach (array_keys($frontier) as $nodeKey) {
                $start = ($nodeKey === -1) ? $this->dom->root : $this->dom->nodes[$nodeKey];
                // PaperG - pass the optional lowercase flag on to seek().
                $start->seek($group[$level], $next, $lowercase);
            }
            $frontier = $next;
        }

        foreach (array_keys($frontier) as $nodeKey) {
            $matchedKeys[$nodeKey] = 1;
        }
    }

    ksort($matchedKeys);

    $found = [];
    foreach (array_keys($matchedKeys) as $nodeKey) {
        $found[] = $this->dom->nodes[$nodeKey];
    }

    if (is_null($idx)) {
        return $found;
    }
    if ($idx < 0) {
        $idx = count($found) + $idx;
    }

    return isset($found[$idx]) ? $found[$idx] : null;
}
||
565 | |||
/**
 * Collect the nodes below this one that satisfy a single parsed selector step.
 *
 * Matches are reported through $ret, keyed by their index in the document's
 * node list with the value 1; the function itself returns nothing.
 * PaperG - added $lowercase to allow case-insensitive testing of the selector value.
 *
 * @param array $selector  tuple [tag, key, val, exp, no_key] as built by parse_selector()
 * @param array $ret       result set, passed by reference and added to
 * @param bool  $lowercase when true, selector value and node value are lowercased before matching
 */
protected function seek($selector, &$ret, $lowercase = false)
{
    global $debugObject;
    if (is_object($debugObject)) {
        $debugObject->debugLogEntry(1);
    }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index: pick the $key-th direct child with the requested tag.
    if ($tag && $key && is_numeric($key)) {
        $count = 0;
        foreach ($this->children as $c) {
            if ($tag === '*' || $tag === $c->tag) {
                if (++$count == $key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;

                    return;
                }
            }
        }

        return;
    }

    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end == 0) {
        // No end index of our own: borrow the nearest ancestor's.
        $parent = $this->parent;
        // Test for null first, and do not dereference $parent once the chain
        // is exhausted (the original read a property of null here).
        while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        if ($parent !== null) {
            $end += $parent->_[HDOM_INFO_END];
        }
    }

    for ($i = $this->_[HDOM_INFO_BEGIN] + 1; $i < $end; $i++) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        // Bare '*' selects direct children only.
        if ($tag === '*' && !$key) {
            if (in_array($node, $this->children, true)) {
                $ret[$i] = 1;
            }
            continue;
        }

        // compare tag
        if ($tag && $tag != $node->tag && $tag !== '*') {
            $pass = false;
        }
        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) {
                    $pass = false;
                }
            } else {
                if (($key != 'plaintext') && !isset($node->attr[$key])) {
                    $pass = false;
                }
            }
        }
        // compare value
        if ($pass && $key && $val && $val !== '*') {
            if ($key == 'plaintext') {
                // A "plaintext" search tests the node's plain text rather than an attribute.
                $nodeKeyValue = $node->text();
            } else {
                // Normal search: test the value of that attribute of the tag.
                $nodeKeyValue = $node->attr[$key];
            }
            if (is_object($debugObject)) {
                $debugObject->debugLog(2, 'testing node: '.$node->tag.' for attribute: '.$key.$exp.$val.' where nodes value is: '.$nodeKeyValue);
            }

            // PaperG - If lowercase is set, do a case insensitive test of the value of the selector.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($debugObject)) {
                $debugObject->debugLog(2, 'after match: '.($check ? 'true' : 'false'));
            }

            // handle multiple class: test each space-separated token on its own
            if (!$check && strcasecmp($key, 'class') === 0) {
                foreach (explode(' ', $node->attr[$key]) as $k) {
                    // Leading, trailing or double spaces produce blank tokens; skip them.
                    // Compared against '' explicitly because empty() would also
                    // discard a class literally named "0".
                    if ($k === '') {
                        continue;
                    }
                    if ($lowercase) {
                        $check = $this->match($exp, strtolower($val), strtolower($k));
                    } else {
                        $check = $this->match($exp, $val, $k);
                    }
                    if ($check) {
                        break;
                    }
                }
            }
            if (!$check) {
                $pass = false;
            }
        }
        if ($pass) {
            $ret[$i] = 1;
        }
        unset($node);
    }
    // $ret is passed by reference, so it is what this function effectively returns.
    if (is_object($debugObject)) {
        $debugObject->debugLog(1, 'EXIT - ret: ', $ret);
    }
}
||
685 | |||
/**
 * Test a node value against a selector value using the selector's operator.
 *
 * Supported operators: '=' (identical), '!=' (not identical), '^=' (starts
 * with), '$=' (ends with) and '*='. For '*=' a pattern starting with '/' is
 * handed to preg_match() unchanged; any other pattern is wrapped as
 * /pattern/i without escaping. Unknown operators yield false.
 *
 * @param string $exp     operator
 * @param string $pattern value from the selector
 * @param string $value   value taken from the node
 *
 * @return bool|int|false boolean for '=' and '!=', the preg_match() result otherwise
 */
protected function match($exp, $pattern, $value)
{
    global $debugObject;
    if (is_object($debugObject)) {
        $debugObject->debugLogEntry(1);
    }

    if ($exp == '=') {
        return $value === $pattern;
    }
    if ($exp == '!=') {
        return $value !== $pattern;
    }
    if ($exp == '^=') {
        return preg_match('/^'.preg_quote($pattern, '/').'/', $value);
    }
    if ($exp == '$=') {
        return preg_match('/'.preg_quote($pattern, '/').'$/', $value);
    }
    if ($exp == '*=') {
        $regex = ($pattern[0] == '/') ? $pattern : '/'.$pattern.'/i';

        return preg_match($regex, $value);
    }

    return false;
}
||
712 | |||
/**
 * Parse a CSS selector string into groups of selector steps.
 *
 * The result is a list with one entry per comma-separated group; each group
 * is a list of steps and each step is the tuple [tag, key, val, exp, no_key].
 * Steps that are empty, '/' or '//' and steps whose tag is 'tbody' (browser
 * generated xpath) are dropped.
 *
 * @param string $selector_string selector as passed to find()
 *
 * @return array list of selector groups
 */
protected function parse_selector($selector_string)
{
    global $debugObject;
    if (is_object($debugObject)) {
        $debugObject->debugLogEntry(1);
    }

    // Pattern of CSS selectors, modified from mootools.
    // Paperg: the colon is allowed in tag and attribute names so that
    // <tag attr:ibute="something"> is found; such attributes must be read with
    // getAttribute() because $dom->x:y is not valid PHP syntax.
    // The "@?" after "\[" lets an attribute specifier start with an @ sign that
    // is not captured; whether that should be documented or removed is open.
    // The hyphen is placed last in each character class: written as "\w-:" it
    // is rejected by PCRE2 (PHP 7.3+) as an invalid range and the whole
    // preg_match_all() call fails.
    $pattern = "/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w:-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
    if (is_object($debugObject)) {
        $debugObject->debugLog(2, 'Matches Array: ', $matches);
    }

    $selectors = [];
    $result = [];

    foreach ($matches as $m) {
        $m[0] = trim($m[0]);
        if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') {
            continue;
        }
        // for browser generated xpath
        if ($m[1] === 'tbody') {
            continue;
        }

        list($tag, $key, $val, $exp, $no_key) = [$m[1], null, null, '=', false];
        if (!empty($m[2])) {
            $key = 'id';
            $val = $m[2];
        }
        if (!empty($m[3])) {
            $key = 'class';
            $val = $m[3];
        }
        if (!empty($m[4])) {
            $key = $m[4];
        }
        if (!empty($m[5])) {
            $exp = $m[5];
        }
        if (!empty($m[6])) {
            $val = $m[6];
        }

        // convert to lowercase
        if ($this->dom->lowercase) {
            $tag = strtolower($tag);
            // $key stays null for tag-only steps; strtolower(null) is deprecated.
            if ($key !== null) {
                $key = strtolower($key);
            }
        }
        // elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0] === '!') {
            $key = substr($key, 1);
            $no_key = true;
        }

        $result[] = [$tag, $key, $val, $exp, $no_key];
        // A comma separator closes the current group.
        if (trim($m[7]) === ',') {
            $selectors[] = $result;
            $result = [];
        }
    }
    if (count($result) > 0) {
        $selectors[] = $result;
    }

    return $selectors;
}
||
789 | |||
/**
 * Magic read access.
 *
 * A set attribute wins and is returned through convert_text(). Otherwise the
 * virtual properties outertext, innertext, plaintext and xmltext map to the
 * corresponding methods. For any other name the result is a boolean telling
 * whether the attribute key exists.
 *
 * @param string $name attribute or virtual property name
 *
 * @return mixed
 */
public function __get($name)
{
    if (isset($this->attr[$name])) {
        return $this->convert_text($this->attr[$name]);
    }

    $virtual = [
        'outertext' => 'outertext',
        'innertext' => 'innertext',
        'plaintext' => 'text',
        'xmltext'   => 'xmltext',
    ];
    if (isset($virtual[$name])) {
        $method = $virtual[$name];

        return $this->$method();
    }

    return array_key_exists($name, $this->attr);
}
||
803 | |||
/**
 * Magic write access.
 *
 * 'outertext' overrides the rendered outer text. 'innertext' replaces the
 * stored text when the node has HDOM_INFO_TEXT and the inner text otherwise.
 * Any other name sets an attribute; an attribute that is not currently set
 * also gets default spacing and double-quote info appended.
 *
 * @param string $name  attribute or virtual property name
 * @param mixed  $value new value
 *
 * @return mixed the assigned value for outertext/innertext, otherwise nothing
 */
public function __set($name, $value)
{
    if ($name == 'outertext') {
        return $this->_[HDOM_INFO_OUTER] = $value;
    }
    if ($name == 'innertext') {
        $slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;

        return $this->_[$slot] = $value;
    }

    $isNew = !isset($this->attr[$name]);
    if ($isNew) {
        $this->_[HDOM_INFO_SPACE][] = [' ', '', ''];
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }
    $this->attr[$name] = $value;
}
||
821 | |||
/**
 * Magic isset().
 *
 * The virtual properties outertext, innertext and plaintext always exist.
 * For any other name the attribute key only has to exist, which covers
 * value-less attributes such as nowrap, checked or selected.
 *
 * @param string $name attribute or virtual property name
 *
 * @return bool
 */
public function __isset($name)
{
    if ($name == 'outertext' || $name == 'innertext' || $name == 'plaintext') {
        return true;
    }

    if (array_key_exists($name, $this->attr)) {
        return true;
    }

    return isset($this->attr[$name]);
}
||
832 | |||
/**
 * Magic unset(): remove an attribute that is currently set.
 *
 * Attributes that are absent or whose value is null are left untouched.
 *
 * @param string $name attribute name
 */
public function __unset($name)
{
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}
||
839 | |||
/**
 * PaperG - convert text from the document's source character set to its
 * target character set when the two differ.
 *
 * When the target is UTF-8 and the text already validates as UTF-8 the text
 * is left alone, since the reported source encoding may have been wrong.
 * For a UTF-8 target a byte order mark at the start or end of the result is
 * removed.
 *
 * @param string $text text to convert
 *
 * @return string|false converted text (whatever iconv() returned when a
 *         conversion took place)
 */
public function convert_text($text)
{
    global $debugObject;
    if (is_object($debugObject)) {
        $debugObject->debugLogEntry(1);
    }

    $sourceCharset = $this->dom ? strtoupper($this->dom->_charset) : '';
    $targetCharset = $this->dom ? strtoupper($this->dom->_target_charset) : '';

    if (is_object($debugObject)) {
        $debugObject->debugLog(3, 'source charset: '.$sourceCharset.' target charaset: '.$targetCharset);
    }

    $converted_text = $text;

    $charsetsDiffer = !empty($sourceCharset)
        && !empty($targetCharset)
        && (strcasecmp($sourceCharset, $targetCharset) != 0);

    if ($charsetsDiffer) {
        $alreadyUtf8 = (strcasecmp($targetCharset, 'UTF-8') == 0) && ($this->is_utf8($text));
        if (!$alreadyUtf8) {
            $converted_text = iconv($sourceCharset, $targetCharset, $text);
        }
    }

    // Strip a UTF-8 BOM from either end of the output.
    if ($targetCharset == 'UTF-8') {
        $bom = "\xef\xbb\xbf";
        if (substr($converted_text, 0, 3) == $bom) {
            $converted_text = substr($converted_text, 3);
        }
        if (substr($converted_text, -3) == $bom) {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
||
882 | |||
/**
 * Returns true if $str is structurally valid UTF-8 and false otherwise.
 *
 * Every byte with the high bit set must be a lead byte followed by the right
 * number of continuation bytes (0x80-0xBF). Like the original, the check
 * still accepts the historical 5- and 6-byte forms and does not reject
 * overlong encodings.
 *
 * @param mixed $str String to be tested
 *
 * @return bool
 */
public static function is_utf8($str)
{
    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);
        // 0x80 itself is a continuation byte, not ASCII: the comparison has to
        // include 128 or a stray 0x80 would pass as valid.
        if ($c >= 128) {
            if ($c >= 254) {
                return false;
            } elseif ($c >= 252) {
                $bits = 6;
            } elseif ($c >= 248) {
                $bits = 5;
            } elseif ($c >= 240) {
                $bits = 4;
            } elseif ($c >= 224) {
                $bits = 3;
            } elseif ($c >= 192) {
                $bits = 2;
            } else {
                // 0x80-0xBF: continuation byte without a lead byte.
                return false;
            }
            // The sequence must not run past the end of the string.
            if (($i + $bits) > $len) {
                return false;
            }
            while ($bits > 1) {
                $i++;
                $b = ord($str[$i]);
                if ($b < 128 || $b > 191) {
                    return false;
                }
                $bits--;
            }
        }
    }

    return true;
}
||
930 | |||
931 | /* |
||
932 | function is_utf8($string) |
||
933 | { |
||
934 | //this is buggy |
||
935 | return (utf8_encode(utf8_decode($string)) == $string); |
||
936 | } |
||
937 | */ |
||
938 | |||
/**
 * Try a few tricks to determine the displayed size of an img on the page.
 *
 * The width/height attributes of the tag are used when set; a dimension that
 * is still unknown is then taken from a "<number>px" declaration in the
 * inline style attribute.
 * NOTE: This only works on an IMG tag.
 *
 * @author John Schlick
 *
 * @version April 19 2012
 *
 * @return array|false false for every tag other than img; otherwise an array
 *         with the keys 'height' and 'width', each -1 when it could not be
 *         determined
 */
public function get_display_size()
{
    global $debugObject;

    if ($this->tag !== 'img') {
        return false;
    }

    $size = ['width' => -1, 'height' => -1];

    // Attributes on the tag itself come first.
    foreach (array_keys($size) as $dimension) {
        if (isset($this->attr[$dimension])) {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Then fall back to the inline style.
    if (isset($this->attr['style'])) {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declarations = [];
        preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            $declarations[$match[1]] = $match[2];
        }

        foreach (array_keys($size) as $dimension) {
            // Only consult the style for a dimension that is still unknown.
            if (!isset($declarations[$dimension]) || $size[$dimension] != -1) {
                continue;
            }
            $declared = $declarations[$dimension];
            // The value has to end in px (pixels)...
            if (strtolower(substr($declared, -2)) != 'px') {
                continue;
            }
            // ...and the part before it has to be an integer.
            $pixels = substr($declared, 0, -2);
            if (filter_var($pixels, FILTER_VALIDATE_INT)) {
                $size[$dimension] = $pixels;
            }
        }
    }

    // Future enhancement:
    // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.

    // Far future enhancement:
    // Look at all the parent tags of this image to see if they specify a class or id that has an img selector
    // that specifies a height or width. In that case the class or id needs the img subselector to apply to the image.

    // Ridiculously far future development:
    // If the class or id is specified in a SEPARATE css file that is not on the page, fetch it and do the same there.

    return [
        'height' => $size['height'],
        'width'  => $size['width'],
    ];
}
||
1018 | |||
1019 | // camel naming conventions |
||
1020 | public function getAllAttributes() |
||
1023 | } |
||
1024 | |||
/**
 * Read an attribute through __get() and decode its HTML entities.
 *
 * @param string $name attribute name
 *
 * @return string
 */
public function getAttribute($name)
{
    $raw = $this->__get($name);

    return html_entity_decode($raw);
}
||
1029 | |||
/**
 * Camel-case alias: set an attribute through __set().
 *
 * @param string $name  attribute name
 * @param mixed  $value new value
 */
public function setAttribute($name, $value)
{
    $this->__set($name, $value);
}
||
1034 | |||
/**
 * Camel-case alias: test for an attribute through __isset().
 *
 * @param string $name attribute name
 *
 * @return bool
 */
public function hasAttribute($name)
{
    $present = $this->__isset($name);

    return $present;
}
||
1039 | |||
/**
 * Mark an attribute as removed by setting it to null through __set().
 *
 * @param string $name attribute name
 */
public function removeAttribute($name)
{
    $removed = null;
    $this->__set($name, $removed);
}
||
1044 | |||
1045 | public function getElementById($id) |
||
1048 | } |
||
1049 | |||
/**
 * Find elements by id; shorthand for find('#id', $idx).
 *
 * @param string   $id  element id
 * @param int|null $idx optional position of the match to return
 *
 * @return simple_html_dom_node[]|simple_html_dom_node|null
 */
public function getElementsById($id, $idx = null)
{
    $selector = "#$id";

    return $this->find($selector, $idx);
}
||
1054 | |||
/**
 * Find the first element matching a tag name; shorthand for find($name, 0).
 *
 * @param string $name tag name (handed to find() as the selector)
 *
 * @return simple_html_dom_node|null
 */
public function getElementByTagName($name)
{
    $first = 0;

    return $this->find($name, $first);
}
||
1059 | |||
/**
 * Find elements by tag name; shorthand for find($name, $idx).
 *
 * @param string   $name tag name (handed to find() as the selector)
 * @param int|null $idx  optional position of the match to return
 *
 * @return simple_html_dom_node[]|simple_html_dom_node|null
 */
public function getElementsByTagName($name, $idx = null)
{
    $matches = $this->find($name, $idx);

    return $matches;
}
||
1064 | |||
/**
 * Camel-case alias of parent() without an argument.
 *
 * @return simple_html_dom_node|null
 */
public function parentNode()
{
    $parentNode = $this->parent();

    return $parentNode;
}
||
1069 | |||
/**
 * Camel-case alias of children().
 *
 * @param int $idx -1 for all children, otherwise the index of a single child
 *
 * @return simple_html_dom_node[]|simple_html_dom_node|null
 */
public function childNodes($idx = -1)
{
    $selected = $this->children($idx);

    return $selected;
}
||
1074 | |||
/**
 * Camel-case alias of first_child().
 *
 * @return simple_html_dom_node|null
 */
public function firstChild()
{
    $first = $this->first_child();

    return $first;
}
||
1079 | |||
/**
 * Camel-case alias of last_child().
 *
 * @return simple_html_dom_node|null
 */
public function lastChild()
{
    $last = $this->last_child();

    return $last;
}
||
1084 | |||
/**
 * Camel-case alias of next_sibling().
 *
 * @return mixed whatever next_sibling() returns
 */
public function nextSibling()
{
    $next = $this->next_sibling();

    return $next;
}
||
1089 | |||
/**
 * Camel-case alias of prev_sibling().
 *
 * @return simple_html_dom_node|null
 */
public function previousSibling()
{
    $previous = $this->prev_sibling();

    return $previous;
}
||
1094 | |||
/**
 * Camel-case alias of has_child().
 *
 * @return bool
 */
public function hasChildNodes()
{
    $hasChildren = $this->has_child();

    return $hasChildren;
}
||
1099 | |||
/**
 * Get the node's name, which is its tag.
 *
 * @return string
 */
public function nodeName()
{
    $name = $this->tag;

    return $name;
}
||
1104 | |||
1105 | public function appendChild($node) |
||
1110 | } |
||
1111 | } |
||
1112 | |||
1113 | /** |
||
1114 | * simple html dom parser |
||
1115 | * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector. |
||
1116 | * Paperg - change $size from protected to public so we can easily access it |
||
1117 | * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it. |
||
1118 | */ |
||
1119 | class simple_html_dom |
||
1120 | { |
||
1121 | /** @var simple_html_dom_node $root */ |
||
1122 | public $root = null; |
||
1123 | public $nodes = []; |
||
1124 | public $callback = null; |
||
1125 | public $lowercase = false; |
||
1126 | // Used to keep track of how large the text was when we started. |
||
1127 | public $original_size; |
||
1128 | public $size; |
||
1129 | protected $pos; |
||
1130 | protected $doc; |
||
1131 | protected $char; |
||
1132 | protected $cursor; |
||
1133 | protected $parent; |
||
1134 | protected $noise = []; |
||
1135 | protected $token_blank = " \t\r\n"; |
||
1136 | protected $token_equal = ' =/>'; |
||
1137 | protected $token_slash = " />\r\n\t"; |
||
1138 | protected $token_attr = ' >'; |
||
1139 | // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. |
||
1140 | public $_charset = ''; |
||
1141 | public $_target_charset = ''; |
||
1142 | protected $default_br_text = ''; |
||
1143 | public $default_span_text = ''; |
||
1144 | |||
1145 | // use isset instead of in_array, performance boost about 30%... |
||
1146 | protected $self_closing_tags = ['img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1]; |
||
1147 | protected $block_tags = ['root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1]; |
||
1148 | // Known sourceforge issue #2977341 |
||
1149 | // B tags that are not closed cause us to return everything to the end of the document. |
||
1150 | protected $optional_closing_tags = [ |
||
1151 | 'tr' => ['tr'=>1, 'td'=>1, 'th'=>1], |
||
1152 | 'th' => ['th'=>1], |
||
1153 | 'td' => ['td'=>1], |
||
1941 |
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.