1 | <?php |
||||
2 | |||||
3 | namespace simplehtmldom; |
||||
4 | |||||
5 | /** |
||||
6 | * Website: http://sourceforge.net/projects/simplehtmldom/ |
||||
7 | * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/). |
||||
8 | * |
||||
9 | * Licensed under The MIT License |
||||
10 | * See the LICENSE file in the project root for more information. |
||||
11 | * |
||||
12 | * Authors: |
||||
13 | * S.C. Chen |
||||
14 | * John Schlick |
||||
15 | * Rus Carroll |
||||
16 | * logmanoriginal |
||||
17 | * |
||||
18 | * Contributors: |
||||
19 | * Yousuke Kumakura |
||||
20 | * Vadim Voituk |
||||
21 | * Antcs |
||||
22 | * |
||||
23 | * Version Rev. 2.0-RC2 (415) |
||||
24 | */ |
||||
25 | include_once __DIR__ . '/constants.php'; |
||||
26 | include_once __DIR__ . '/Debug.php'; |
||||
27 | |||||
28 | /** |
||||
29 | * HTMLNode class |
||||
30 | * @property string $innertext |
||||
31 | * @property string|null $title |
||||
32 | * @property string|null $alt |
||||
33 | * @property string|null $src |
||||
34 | * @property string|null $href |
||||
35 | * @property string|null $async |
||||
36 | * @property string|null $defer |
||||
37 | */ |
||||
38 | class HtmlNode |
||||
39 | { |
||||
40 | const HDOM_TYPE_ELEMENT = 1; |
||||
41 | const HDOM_TYPE_COMMENT = 2; |
||||
42 | const HDOM_TYPE_TEXT = 3; |
||||
43 | const HDOM_TYPE_ROOT = 5; |
||||
44 | const HDOM_TYPE_UNKNOWN = 6; |
||||
45 | const HDOM_TYPE_CDATA = 7; |
||||
46 | |||||
47 | const HDOM_QUOTE_DOUBLE = 0; |
||||
48 | const HDOM_QUOTE_SINGLE = 1; |
||||
49 | const HDOM_QUOTE_NO = 3; |
||||
50 | |||||
51 | const HDOM_INFO_BEGIN = 0; |
||||
52 | const HDOM_INFO_END = 1; |
||||
53 | const HDOM_INFO_QUOTE = 2; |
||||
54 | const HDOM_INFO_SPACE = 3; |
||||
55 | const HDOM_INFO_TEXT = 4; |
||||
56 | const HDOM_INFO_INNER = 5; |
||||
57 | const HDOM_INFO_OUTER = 6; |
||||
58 | const HDOM_INFO_ENDSPACE = 7; |
||||
59 | |||||
60 | public $nodetype = self::HDOM_TYPE_TEXT; |
||||
61 | public $tag = 'text'; |
||||
62 | public $attr = []; |
||||
63 | public $children = []; |
||||
64 | public $nodes = []; |
||||
65 | public $parent = null; |
||||
66 | public $_ = []; |
||||
67 | private $dom = null; |
||||
68 | |||||
/**
 * Forwards deprecated lower_case method names to their camelCase
 * replacements and logs a deprecation message for every forwarded call.
 *
 * @param string $func Name of the inaccessible method that was called.
 * @param array $args Arguments of that call.
 * @return mixed Return value of the replacement method. Null when $func is
 * not a known alias and the error handler resumed execution.
 */
public function __call($func, $args)
{
    // Allow users to call methods with lower_case syntax
    $aliases = [
        'children' => 'childNodes',
        'first_child' => 'firstChild',
        'has_child' => 'hasChildNodes',
        'last_child' => 'lastChild',
        'next_sibling' => 'nextSibling',
        'prev_sibling' => 'previousSibling',
    ];

    if (!isset($aliases[$func])) {
        trigger_error(
            'Call to undefined method ' . __CLASS__ . '::' . $func . '()',
            E_USER_ERROR
        );

        // E_USER_ERROR only halts the script with the default handler. A
        // custom handler may resume execution, so stop here instead of
        // forwarding to an undefined replacement.
        return null;
    }

    $actual_function = $aliases[$func];

    // phpcs:ignore Generic.Files.LineLength
    Debug::log(__CLASS__ . '->' . $func . '() has been deprecated and will be removed in the next major version of simplehtmldom. Use ' . __CLASS__ . '->' . $actual_function . '() instead.');

    return call_user_func_array([$this, $actual_function], $args);
}
||||
103 | |||||
/**
 * Creates a node and, when a document is given, registers the node in that
 * document's node list.
 *
 * @param object|null $dom Owning document (must expose a `nodes` array), or
 * null to create a detached node.
 */
public function __construct($dom)
{
    if (null !== $dom) {
        $this->dom = $dom;
        $dom->nodes[] = $this;
    }
}
||||
113 | |||||
/**
 * Provides the reduced, human-readable view of this node used by var_dump().
 *
 * @return array Keys: 'nodetype' (constant name plus numeric value), 'tag',
 * 'attributes' and 'nodes' (the string 'none' when the list is empty).
 */
public function __debugInfo()
{
    // Translate node type to human-readable form
    $labels = [
        self::HDOM_TYPE_ELEMENT => 'HDOM_TYPE_ELEMENT',
        self::HDOM_TYPE_COMMENT => 'HDOM_TYPE_COMMENT',
        self::HDOM_TYPE_TEXT => 'HDOM_TYPE_TEXT',
        self::HDOM_TYPE_ROOT => 'HDOM_TYPE_ROOT',
        self::HDOM_TYPE_CDATA => 'HDOM_TYPE_CDATA',
    ];

    // Anything that is not listed above is reported as unknown
    $name = 'HDOM_TYPE_UNKNOWN';

    foreach ($labels as $type => $label) {
        // Loose comparison on purpose (same matching rules as a switch)
        if ($this->nodetype == $type) {
            $name = $label;
            break;
        }
    }

    $info = [
        'nodetype' => $name . ' (' . $this->nodetype . ')',
        'tag' => $this->tag,
        'attributes' => 'none',
        'nodes' => 'none',
    ];

    if (!empty($this->attr)) {
        $info['attributes'] = $this->attr;
    }

    if (!empty($this->nodes)) {
        $info['nodes'] = $this->nodes;
    }

    return $info;
}
||||
145 | |||||
/**
 * Converts the node to a string by rendering its outer text.
 *
 * @return string Result of outertext().
 */
public function __toString()
{
    $rendered = $this->outertext();

    return $rendered;
}
||||
150 | |||||
/**
 * Drops this node's references to its document and to its parent node.
 *
 * Both properties are unset (not assigned null).
 */
public function clear()
{
    // Break link to origin
    unset($this->dom);

    // Break link to branch
    unset($this->parent);
}
||||
156 | |||||
/**
 * Prints this node and, recursively, all entries of its node list as a
 * tab-indented tree (one line per node).
 *
 * @codeCoverageIgnore
 * @param bool $show_attr Whether to append the attribute list to each line.
 * @param int $depth Indentation level of this node (number of tabs).
 */
public function dump($show_attr = true, $depth = 0)
{
    $line = str_repeat("\t", $depth) . $this->tag;

    if ($show_attr && count($this->attr) > 0) {
        $line .= '(';

        foreach ($this->attr as $name => $value) {
            $line .= '[' . $name . ']=>"' . $value . '", ';
        }

        $line .= ')';
    }

    echo $line . "\n";

    if (!$this->nodes) {
        return;
    }

    foreach ($this->nodes as $child) {
        $child->dump($show_attr, $depth + 1);
    }
}
||||
178 | |||||
/**
 * Builds a one-line debug description of this node: tag, attributes, the
 * internal info array ($_), inner text info and the number of children and
 * nodes.
 *
 * @codeCoverageIgnore
 * @param bool $echo True to print the description, false to return it.
 * @return string|null The description when $echo is false, null otherwise.
 */
public function dump_node($echo = true)
{
    $string = $this->tag;

    if (count($this->attr) > 0) {
        $string .= '(';
        foreach ($this->attr as $k => $v) {
            $string .= "[$k]=>\"$v\", ";
        }
        $string .= ')';
    }

    if (count($this->_) > 0) {
        $string .= ' $_ (';
        foreach ($this->_ as $k => $v) {
            if (is_array($v)) {
                $string .= "[$k]=>(";
                foreach ($v as $k2 => $v2) {
                    // Entries may be arrays themselves (makeup() reads three
                    // values per attribute from the space info); flatten
                    // them instead of printing "Array" with a warning.
                    if (is_array($v2)) {
                        $v2 = implode(', ', $v2);
                    }

                    $string .= "[$k2]=>\"$v2\", ";
                }
                $string .= ')';
            } else {
                $string .= "[$k]=>\"$v\", ";
            }
        }
        $string .= ')';
    }

    // NOTE(review): no `text` property is declared in the visible part of
    // this class; this presumably relies on magic accessors - confirm.
    if (isset($this->text)) {
        $string .= " text: ({$this->text})";
    }

    // This section used to test an undefined local variable and therefore
    // never printed anything; it describes the current node.
    $string .= ' HDOM_INNER_INFO: ';
    if (isset($this->_[self::HDOM_INFO_INNER])) {
        $string .= "'" . $this->_[self::HDOM_INFO_INNER] . "'";
    } else {
        $string .= ' NULL ';
    }

    $string .= ' children: ' . count($this->children);
    $string .= ' nodes: ' . count($this->nodes);
    $string .= "\n";

    if ($echo) {
        echo $string;

        return null;
    }

    return $string;
}
||||
236 | |||||
/**
 * Returns the parent node; when a node is passed, makes it the new parent
 * first.
 *
 * Known limitation: the node is appended to the new parent's node and child
 * lists, but it is not removed from the lists of its previous parent.
 *
 * @param HtmlNode|null $parent New parent node, or null to only read.
 * @return HtmlNode|null The (possibly new) parent node.
 */
public function parent($parent = null)
{
    if (null === $parent) {
        return $this->parent;
    }

    $this->parent = $parent;
    $this->parent->nodes[] = $this;
    $this->parent->children[] = $this;

    return $this->parent;
}
||||
250 | |||||
/**
 * Walks up the parent chain and returns the closest ancestor with the given
 * tag name (exact, case-sensitive comparison).
 *
 * @param string $tag Tag name to look for.
 * @return HtmlNode|null The matching ancestor, or null if there is none.
 */
public function find_ancestor_tag($tag)
{
    for ($candidate = $this->parent; null !== $candidate; $candidate = $candidate->parent) {
        if ($candidate->tag === $tag) {
            return $candidate;
        }
    }

    return null;
}
||||
269 | |||||
/**
 * Returns the inner markup of this node: the stored inner text (or, if there
 * is none, the stored plain text) followed by the outer text of every entry
 * in the node list, passed through convert_text().
 *
 * @return string
 */
public function innertext()
{
    $markup = '';

    if (isset($this->_[self::HDOM_INFO_INNER])) {
        $markup = $this->_[self::HDOM_INFO_INNER];
    } elseif (isset($this->_[self::HDOM_INFO_TEXT])) {
        $markup = $this->_[self::HDOM_INFO_TEXT];
    }

    foreach ($this->nodes as $child) {
        $markup .= $child->outertext();
    }

    return $this->convert_text($markup);
}
||||
286 | |||||
/**
 * Returns the complete markup of this node (start tag, content, end tag),
 * passed through convert_text().
 *
 * The root node renders as its inner text only. A stored outer text, or
 * failing that a stored plain text, takes precedence over rebuilding the
 * markup from the tag, attributes and child nodes.
 *
 * @return string
 */
public function outertext()
{
    if ('root' === $this->tag) {
        return $this->innertext();
    }

    // todo: What is the use of this callback? Remove?
    // The callback runs before the stored texts are checked.
    if ($this->dom && null !== $this->dom->callback) {
        call_user_func_array($this->dom->callback, [$this]);
    }

    // Stored outer text wins over stored plain text
    foreach ([self::HDOM_INFO_OUTER, self::HDOM_INFO_TEXT] as $slot) {
        if (isset($this->_[$slot])) {
            return $this->convert_text($this->_[$slot]);
        }
    }

    $markup = isset($this->_[self::HDOM_INFO_BEGIN]) ? $this->makeup() : '';

    // todo: <br> should either never have self::HDOM_INFO_INNER or always
    if (isset($this->_[self::HDOM_INFO_INNER]) && 'br' !== $this->tag) {
        $markup .= $this->_[self::HDOM_INFO_INNER];
    }

    if ($this->nodes) {
        foreach ($this->nodes as $child) {
            $markup .= $child->outertext();
        }
    }

    // An end index of 0 means that there is no closing tag to render
    $has_end = isset($this->_[self::HDOM_INFO_END]);

    if ($has_end && 0 != $this->_[self::HDOM_INFO_END]) {
        $markup .= '</' . $this->tag . '>';
    }

    return $this->convert_text($markup);
}
||||
331 | |||||
/**
 * Tells whether the tag of the given node (compared case-insensitively) is
 * in the list of block level elements.
 *
 * @see https://www.w3resource.com/html/HTML-block-level-and-inline-elements.php
 * @param object $node Node whose `tag` property is checked.
 * @return bool
 */
protected function is_block_element($node)
{
    // todo: When we have the utility class this should be moved there
    static $block_tags = [
        'p' => true,
        'h1' => true,
        'h2' => true,
        'h3' => true,
        'h4' => true,
        'h5' => true,
        'h6' => true,
        'ol' => true,
        'ul' => true,
        'pre' => true,
        'address' => true,
        'blockquote' => true,
        'dl' => true,
        'div' => true,
        'fieldset' => true,
        'form' => true,
        'hr' => true,
        'noscript' => true,
        'table' => true,
    ];

    $name = strtolower($node->tag);

    return isset($block_tags[$name]);
}
||||
356 | |||||
/**
 * Tells whether the tag of the given node (compared case-insensitively) is
 * in the list of inline level elements.
 *
 * @see https://www.w3resource.com/html/HTML-block-level-and-inline-elements.php
 * @param object $node Node whose `tag` property is checked.
 * @return bool
 */
protected function is_inline_element($node)
{
    // todo: When we have the utility class this should be moved there
    static $inline_tags = [
        'b' => true,
        'big' => true,
        'i' => true,
        'small' => true,
        'tt' => true,
        'abbr' => true,
        'acronym' => true,
        'cite' => true,
        'code' => true,
        'dfn' => true,
        'em' => true,
        'kbd' => true,
        'strong' => true,
        'samp' => true,
        'var' => true,
        'a' => true,
        'bdo' => true,
        'br' => true,
        'img' => true,
        'map' => true,
        'object' => true,
        'q' => true,
        'script' => true,
        'span' => true,
        'sub' => true,
        'sup' => true,
        'button' => true,
        'input' => true,
        'label' => true,
        'select' => true,
        'textarea' => true,
    ];

    $name = strtolower($node->tag);

    return isset($inline_tags[$name]);
}
||||
372 | |||||
/**
 * Returns the plain text of this node and its descendants.
 *
 * Script and style elements, comments and unknown nodes contribute no text
 * of their own. Block level children are separated from the preceding text
 * by an empty line, <br> is replaced by the document's line break text (or
 * DEFAULT_BR_TEXT) and children without any text are skipped.
 *
 * @param bool $trim True to remove whitespace at the start and end of the
 * result, false to reduce it to a single space.
 * @return string
 */
public function text($trim = true)
{
    $ret = '';
    $own_tag = strtolower($this->tag);

    $is_silent = 'script' === $own_tag
        || 'style' === $own_tag
        || self::HDOM_TYPE_COMMENT === $this->nodetype
        || self::HDOM_TYPE_UNKNOWN === $this->nodetype;

    if (!$is_silent) {
        // Only read slots that exist; a missing slot counts as "no text"
        if (isset($this->_[self::HDOM_INFO_INNER])) {
            $ret = $this->_[self::HDOM_INFO_INNER];
        } elseif (
            self::HDOM_TYPE_TEXT === $this->nodetype
            && isset($this->_[self::HDOM_INFO_TEXT])
        ) {
            $ret = $this->_[self::HDOM_INFO_TEXT];
        }
    }

    if (is_null($this->nodes)) {
        return '';
    }

    foreach ($this->nodes as $n) {
        if ($this->is_block_element($n)) {
            $block = ltrim($this->convert_text($n->text(false)));

            // Compare with '' explicitly: empty() would also drop "0"
            if ('' === $block) {
                continue;
            }

            $ret = rtrim($ret) . "\n\n" . $block;
        } elseif ($this->is_inline_element($n)) {
            // todo: <br> introduces code smell because no space but \n
            if ('br' === strtolower($n->tag)) {
                // Detached nodes have no document to ask for the break text
                $br_text = $this->dom ? $this->dom->default_br_text : null;
                $ret .= $br_text ?: DEFAULT_BR_TEXT;
            } else {
                // Render the child once; rendering it a second time for the
                // append doubles the work at every nesting level.
                $inline = $this->convert_text($n->text(false));

                if ('' === ltrim($inline)) {
                    continue;
                }

                $ret .= $inline;
            }
        } else {
            $ret .= $this->convert_text($n->text(false));
        }
    }

    // Reduce whitespace at start/end to a single (or none) space.
    // The no-break space is written as one code point (\x{A0}): with the u
    // modifier "\xC2\xA0" denotes the two characters U+00C2 and U+00A0 and
    // would strip a leading/trailing "Â" as well.
    $edge = $trim ? '' : ' ';
    $patterns = [
        '/[ \t\n\r\0\x0B\x{A0}]+$/u',
        '/^[ \t\n\r\0\x0B\x{A0}]+/u',
    ];

    foreach ($patterns as $pattern) {
        $cleaned = preg_replace($pattern, $edge, $ret);

        if (null === $cleaned) {
            // preg_replace() failed (e.g. invalid UTF-8): keep the text
            // untrimmed instead of discarding it.
            break;
        }

        $ret = $cleaned;
    }

    return $ret;
}
||||
430 | |||||
/**
 * Returns the inner text with all CDATA section markers removed.
 *
 * The opening marker is matched case-insensitively, the closing marker
 * case-sensitively.
 *
 * @return string
 */
public function xmltext()
{
    $without_open = str_ireplace('<![CDATA[', '', $this->innertext());

    return str_replace(']]>', '', $without_open);
}
||||
439 | |||||
/**
 * Builds the start tag of this node from its tag name and attributes.
 *
 * Nodes with stored plain text (text, comment, unknown) return that text
 * instead. Attributes set to null or false are omitted, attributes set to
 * true are written without a value. Spacing and quote style recorded in the
 * info array are reproduced; quoted values are encoded with htmlentities()
 * using the document's target charset.
 *
 * @return string
 */
public function makeup()
{
    // text, comment, unknown
    if (isset($this->_[self::HDOM_INFO_TEXT])) {
        return $this->_[self::HDOM_INFO_TEXT];
    }

    $markup = '<' . $this->tag;

    foreach ($this->attr as $name => $value) {
        // skip removed attribute
        if (null === $value || false === $value) {
            continue;
        }

        // Recorded spacing: [0] before the name, [1] before and [2] after '='
        $has_spacing = isset($this->_[self::HDOM_INFO_SPACE][$name]);
        $markup .= $has_spacing ? $this->_[self::HDOM_INFO_SPACE][$name][0] : ' ';

        // no value attr: nowrap, checked selected...
        if (true === $value) {
            $markup .= $name;
            continue;
        }

        $quote_type = self::HDOM_QUOTE_DOUBLE;

        if (isset($this->_[self::HDOM_INFO_QUOTE][$name])) {
            $quote_type = $this->_[self::HDOM_INFO_QUOTE][$name];
        }

        // Loose comparisons on purpose (same matching rules as a switch)
        if (self::HDOM_QUOTE_SINGLE == $quote_type) {
            $quote = '\'';
            $value = htmlentities($value, ENT_QUOTES, $this->dom->target_charset);
        } elseif (self::HDOM_QUOTE_NO == $quote_type) {
            // Unquoted values are written as they are
            $quote = '';
        } else {
            $quote = '"';
            $value = htmlentities($value, ENT_COMPAT, $this->dom->target_charset);
        }

        $before_equals = $has_spacing ? $this->_[self::HDOM_INFO_SPACE][$name][1] : '';
        $after_equals = $has_spacing ? $this->_[self::HDOM_INFO_SPACE][$name][2] : '';

        $markup .= $name . $before_equals . '=' . $after_equals . $quote . $value . $quote;
    }

    if (isset($this->_[self::HDOM_INFO_ENDSPACE])) {
        $markup .= $this->_[self::HDOM_INFO_ENDSPACE];
    }

    return $markup . '>';
}
||||
501 | |||||
/**
 * Element selector
 *
 * Every selector group returned by parse_selector() is resolved level by
 * level via seek(), starting at this node. The matches of all groups are
 * merged and ordered by their index in the document's node list.
 *
 * @param string $selector Selector string.
 * @param int|null $idx Null to get all matches; otherwise the position of
 * the match to return (negative values count from the end).
 * @param bool $lowercase Passed on to seek().
 * @return HtmlNode[]|HtmlNode|null List of matches when $idx is null,
 * otherwise the match at that position or null if there is none. An empty
 * array is returned, regardless of $idx, when the selector is empty or the
 * current node has no start tag.
 */
public function find($selector, $idx = null, $lowercase = false)
{
    $groups = $this->parse_selector($selector);
    $group_count = count($groups);

    if (0 === $group_count) {
        return [];
    }

    $matched_keys = [];

    // find each selector
    for ($g = 0; $g < $group_count; ++$g) {
        // The change on the below line was documented on the sourceforge
        // code tracker id 2788009 (the count used to be taken from the
        // first group only)
        $depth = count($groups[$g]);

        if (0 === $depth) {
            Debug::log_once('Empty selector (' . $selector . ') matches nothing.');

            return [];
        }

        if (!isset($this->_[self::HDOM_INFO_BEGIN])) {
            Debug::log_once('Invalid operation. The current node has no start tag.');

            return [];
        }

        $current = [$this->_[self::HDOM_INFO_BEGIN] => 1];
        $combinator = ' ';

        // handle descendant selectors, no recursive!
        for ($level = 0; $level < $depth; ++$level) {
            $next = [];

            foreach (array_keys($current) as $key) {
                $origin = (-1 === $key) ? $this->dom->root : $this->dom->nodes[$key];
                // PaperG - Pass this optional parameter on to the seek function.
                $origin->seek($groups[$g][$level], $next, $combinator, $lowercase);
            }

            $current = $next;
            $combinator = $groups[$g][$level][6]; // Next Combinator
        }

        // Union: keys that were already found by a previous group are kept
        $matched_keys += array_fill_keys(array_keys($current), 1);
    }

    // sort keys
    ksort($matched_keys);

    $found = [];

    foreach (array_keys($matched_keys) as $key) {
        $found[] = $this->dom->nodes[$key];
    }

    // return nth-element or array
    if (null === $idx) {
        return $found;
    }

    if ($idx < 0) {
        $idx += count($found);
    }

    return isset($found[$idx]) ? $found[$idx] : null;
}
||||
576 | |||||
/**
 * Same as find(), but yields null whenever the result is empty (e.g. an
 * empty list of matches).
 *
 * @param string $selector Selector string.
 * @param int|null $idx See find().
 * @param bool $lowercase See find().
 * @return HtmlNode[]|HtmlNode|null
 */
public function expect($selector, $idx = null, $lowercase = false)
{
    $result = $this->find($selector, $idx, $lowercase);

    if ($result) {
        return $result;
    }

    return null;
}
||||
581 | |||||
/**
 * Collects all nodes that match one parsed selector, looking at the nodes
 * the given combinator makes reachable from this node.
 *
 * @param array $selector Parsed selector: pseudo selector, tag, pseudo
 * element, id, list of classes, list of attribute conditions (the 7th
 * entry, the combinator to the next selector, is consumed by find()).
 * @param array $ret Matches are added to this array as `node index => 1`.
 * @param string $parent_cmd Combinator that leads to this selector:
 * ' ' (descendant), '>' (child), '+' (next sibling) or '~' (subsequent
 * siblings).
 * @param bool $lowercase True to lowercase the classes of the node and the
 * attribute values on both sides before comparing them.
 */
protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
{
    list($ps_selector, $tag, $ps_element, $id, $class, $attributes) = $selector;
    $nodes = [];

    if (' ' === $parent_cmd) { // Descendant Combinator
        // Find parent closing tag if the current element doesn't have a closing
        // tag (i.e. void element)
        $end = (!empty($this->_[self::HDOM_INFO_END])) ? $this->_[self::HDOM_INFO_END] : 0;

        if (0 == $end && $this->parent) {
            $parent = $this->parent;

            while (null !== $parent && !isset($parent->_[self::HDOM_INFO_END])) {
                --$end;
                $parent = $parent->parent;
            }

            // Without any ancestor that has an end index, fall back to
            // "until the last node" below instead of reading from null.
            $end = (null === $parent) ? 0 : $end + $parent->_[self::HDOM_INFO_END];
        }

        if (0 === $end) {
            $end = count($this->dom->nodes);
        }

        // Get list of target nodes
        $nodes_start = $this->_[self::HDOM_INFO_BEGIN] + 1;

        // remove() makes $this->dom->nodes non-contiguous; use what is left.
        $nodes = array_intersect_key(
            $this->dom->nodes,
            array_flip(range($nodes_start, $end))
        );
    } elseif ('>' === $parent_cmd) { // Child Combinator
        $nodes = $this->children;
    } elseif (('+' === $parent_cmd || '~' === $parent_cmd) && $this->parent) {
        $siblings = $this->parent->children;

        // Identity search: a loose comparison of nodes would compare their
        // properties (and thereby whole subtrees) recursively.
        $index = array_search($this, $siblings, true);

        if (false !== $index) {
            if ('+' === $parent_cmd) { // Next-Sibling Combinator
                if ($index + 1 < count($siblings)) {
                    $nodes[] = $siblings[$index + 1];
                }
            } else { // Subsequent Sibling Combinator
                // Start behind this element; it is not its own sibling
                $nodes = array_slice($siblings, $index + 1);
            }
        }
    }

    // Go through each element starting at this element until the end tag
    // Note: If this element is a void tag, any previous void element is
    // skipped.
    foreach ($nodes as $node) {
        $pass = true;

        // Skip root nodes
        if (!$node->parent) {
            continue;
        }

        // Handle 'text' selector
        if ('text' === $tag) {
            if ('text' === $node->tag) {
                $ret[array_search($node, $this->dom->nodes, true)] = 1;
            }

            if (isset($node->_[self::HDOM_INFO_INNER])) {
                $ret[$node->_[self::HDOM_INFO_BEGIN]] = 1;
            }

            continue;
        }

        // Handle 'cdata' selector
        if ('cdata' === $tag) {
            if ('cdata' === $node->tag) {
                $ret[$node->_[self::HDOM_INFO_BEGIN]] = 1;
            }

            continue;
        }

        // Handle 'comment'
        if ('comment' === $tag && 'comment' === $node->tag) {
            $ret[$node->_[self::HDOM_INFO_BEGIN]] = 1;
            continue;
        }

        // Skip if node isn't a child node (i.e. text nodes)
        if (!in_array($node, $node->parent->children, true)) {
            continue;
        }

        // Skip if tag doesn't match
        if ('' !== $tag && $tag !== $node->tag && '*' !== $tag) {
            $pass = false;
        }

        // Skip if ID doesn't exist
        if ($pass && '' !== $id && !isset($node->attr['id'])) {
            $pass = false;
        }

        // Check if ID matches
        if ($pass && '' !== $id && isset($node->attr['id'])) {
            // Note: Only consider the first ID (as browsers do)
            $node_id = explode(' ', trim($node->attr['id']))[0];

            if ($id !== $node_id) {
                $pass = false;
            }
        }

        // Check if all class(es) exist
        if ($pass && '' !== $class && is_array($class) && !empty($class)) {
            if (isset($node->attr['class'])) {
                // Apply the same rules for the pattern and attribute value
                // Attribute values must not contain control characters other than space
                // https://www.w3.org/TR/html/dom.html#text-content
                // https://www.w3.org/TR/html/syntax.html#attribute-values
                // https://www.w3.org/TR/xml/#AVNormalize
                $node_classes = preg_replace("/[\r\n\t\s]+/u", ' ', $node->attr['class']);
                $node_classes = trim($node_classes);
                $node_classes = explode(' ', $node_classes);

                if ($lowercase) {
                    $node_classes = array_map('strtolower', $node_classes);
                }

                foreach ($class as $c) {
                    if (!in_array($c, $node_classes)) {
                        $pass = false;
                        break;
                    }
                }
            } else {
                $pass = false;
            }
        }

        // Check attributes
        if (
            $pass
            && '' !== $attributes
            && is_array($attributes)
            && !empty($attributes)
        ) {
            foreach ($attributes as $a) {
                list(
                    $att_name,
                    $att_expr,
                    $att_val,
                    $att_inv,
                    $att_case_sensitivity
                ) = $a;

                // Handle indexing attributes (i.e. "[2]")
                /*
                 * Note: This is not supported by the CSS Standard but adds
                 * the ability to select items compatible to XPath (i.e.
                 * the 3rd element within its parent).
                 *
                 * Note: This doesn't conflict with the CSS Standard which
                 * doesn't work on numeric attributes anyway.
                 */
                if (
                    is_numeric($att_name)
                    && '' === $att_expr
                    && '' === $att_val
                ) {
                    $count = 0;

                    // Find index of current element in parent
                    foreach ($node->parent->children as $c) {
                        if ($c->tag === $node->tag) {
                            ++$count;
                        }
                        if ($c === $node) {
                            break;
                        }
                    }

                    // If this is the correct node, continue with next
                    // attribute
                    if ($count === (int) $att_name) {
                        continue;
                    }
                }

                // Check attribute availability
                if ($att_inv) { // Attribute should NOT be set
                    if (isset($node->attr[$att_name])) {
                        $pass = false;
                        break;
                    }
                } else { // Attribute should be set
                    // todo: "plaintext" is not a valid CSS selector!
                    if (
                        'plaintext' !== $att_name
                        && !isset($node->attr[$att_name])
                    ) {
                        $pass = false;
                        break;
                    }
                }

                // Continue with next attribute if expression isn't defined
                if ('' === $att_expr) {
                    continue;
                }

                // If they have told us that this is a "plaintext"
                // search then we want the plaintext of the node - right?
                // todo "plaintext" is not a valid CSS selector!
                if ('plaintext' === $att_name) {
                    $nodeKeyValue = $node->text();
                } else {
                    $nodeKeyValue = $node->attr[$att_name];
                }

                // If lowercase is set, do a case insensitive test of
                // the value of the selector.
                if ($lowercase) {
                    $check = $this->match(
                        $att_expr,
                        strtolower($att_val),
                        strtolower($nodeKeyValue),
                        $att_case_sensitivity
                    );
                } else {
                    $check = $this->match(
                        $att_expr,
                        $att_val,
                        $nodeKeyValue,
                        $att_case_sensitivity
                    );
                }

                $check = 'not' === $ps_element ? !$check : $check;

                if (!$check) {
                    $pass = false;
                    break;
                }
            }
        }

        // Found a match. Add to list
        $pass = 'not' === $ps_selector ? !$pass : $pass;
        if ($pass) {
            $ret[$node->_[self::HDOM_INFO_BEGIN]] = 1;
        }
    }
}
||||
843 | |||||
/**
 * Tests an attribute value against the pattern of an attribute selector.
 *
 * Runs of whitespace in both strings are reduced to a single space and the
 * strings are trimmed before they are compared.
 *
 * @param string $exp Operator of the attribute selector: '=', '!=', '^=',
 * '$=', '*=', '|=' or '~='.
 * @param string $pattern Value given in the selector.
 * @param string $value Value of the attribute.
 * @param string $case_sensitivity 'i' to compare case-insensitively.
 * @return bool|int Truthy if the value matches (the '^=', '$=' and '*='
 * operators return the result of preg_match()). False for unknown operators.
 */
protected function match($exp, $pattern, $value, $case_sensitivity)
{
    if ('i' === $case_sensitivity) {
        $pattern = strtolower($pattern);
        $value = strtolower($value);
    }

    // Apply the same rules for the pattern and attribute value
    // Attribute values must not contain control characters other than space
    // https://www.w3.org/TR/html/dom.html#text-content
    // https://www.w3.org/TR/html/syntax.html#attribute-values
    // https://www.w3.org/TR/xml/#AVNormalize
    $pattern = preg_replace("/[\r\n\t\s]+/u", ' ', $pattern);
    $pattern = trim($pattern);

    $value = preg_replace("/[\r\n\t\s]+/u", ' ', $value);
    $value = trim($value);

    switch ($exp) {
        case '=':
            return $value === $pattern;
        case '!=':
            return $value !== $pattern;
        case '^=':
            return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
        case '$=':
            return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
        case '*=':
            return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
        case '|=':
            /*
             * [att|=val]
             *
             * Represents an element with the att attribute, its value
             * either being exactly "val" or beginning with "val"
             * immediately followed by "-" (U+002D).
             *
             * A plain prefix test is not enough: "en" must not match
             * "english".
             */
            return $value === $pattern
                || 0 === strpos($value, $pattern . '-');
        case '~=':
            /*
             * [att~=val]
             *
             * Represents an element with the att attribute whose value is a
             * whitespace-separated list of words, one of which is exactly
             * "val". If "val" contains whitespace, it will never represent
             * anything (since the words are separated by spaces). Also if
             * "val" is the empty string, it will never represent anything.
             *
             * The empty string is rejected explicitly because splitting an
             * empty value yields one empty word.
             */
            return '' !== $pattern
                && in_array($pattern, explode(' ', $value), true);
    }

    Debug::log('Unhandled attribute selector: ' . $exp . '!');

    return false;
}
||||
899 | |||||
/**
 * Split a CSS selector string into its individual compound selectors.
 *
 * The main pattern (modified from mootools, https://mootools.net/) is matched
 * repeatedly against the trimmed selector string plus one trailing space that
 * serves as the final separator. After the full match is dropped, every
 * parsed entry holds:
 *
 *   [0] pseudo selector name found in front of the tag name
 *   [1] tag name (words, colons, asterisks and hyphens; may be empty)
 *   [2] pseudo selector name found behind the tag name
 *   [3] id name
 *   [4] class names: an array split on "." or '' when there are none
 *   [5] attributes: an array of
 *       [name, expression, value, inverted flag, case-sensitivity]
 *       or '' when there are none
 *   [6] separator: ' ' for a descendant separator, the trimmed separator
 *       otherwise, and '' when the entry is closed by a "," list separator
 *
 * Attribute names may contain a colon and may be preceded by an "@" that is
 * not captured. A leading "!" on the name marks the attribute as inverted.
 *
 * @param string $selector_string Selector (list) to parse.
 *
 * @return array One element per comma-separated selector, each being the
 *               ordered list of entries described above.
 */
protected function parse_selector($selector_string)
{
    // phpcs:ignore Generic.Files.LineLength
    $pattern = "/(?:\:(\w+)\()?([\w:\*-]*)(?:\:(\w+)\()?(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?(?:\))?(?:\))?([\/, >+~]+)/is";

    // The appended ' ' acts as pseudo separator for the last selector
    $subject = trim($selector_string) . ' ';
    preg_match_all($pattern, $subject, $tokens, PREG_SET_ORDER);

    $selectors = [];
    $chain = [];

    foreach ($tokens as $parts) {
        // NoOps: nothing but whitespace or a bare "/" or "//"
        if (in_array(trim($parts[0]), ['', '/', '//'], true)) {
            continue;
        }

        // Drop the full match so the capture groups start at index 0
        array_shift($parts);

        if ($this->dom->lowercase) {
            $parts[1] = strtolower($parts[1]);
        }

        if ('' !== $parts[4]) {
            $parts[4] = explode('.', $parts[4]);
        }

        if ('' !== $parts[5]) {
            /*
             * Same attribute pattern as above, this time capturing:
             * [1] name, [2] expression, [3] value, [4] case sensitivity
             */
            preg_match_all(
                "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is",
                trim($parts[5]),
                $attribute_matches,
                PREG_SET_ORDER
            );

            $parts[5] = [];

            foreach ($attribute_matches as $found) {
                if ('' === trim($found[0])) {
                    continue;
                }

                $name = $found[1];
                $inverted = (isset($name[0]) && '!' === $name[0]);

                if ($inverted) {
                    $name = substr($name, 1);
                }

                $expression = isset($found[2]) ? $found[2] : '';
                $value = isset($found[3]) ? $found[3] : '';
                $case = isset($found[4]) ? strtolower($found[4]) : '';

                $parts[5][] = [$name, $expression, $value, $inverted, $case];
            }
        }

        // A separator made of whitespace only is the descendant separator
        $separator = trim($parts[6]);

        if ('' === $separator && '' !== $parts[6]) {
            $separator = ' ';
        }

        // A "," ends the current selector and starts the next one
        $ends_selector = (',' === $separator);
        $parts[6] = $ends_selector ? '' : $separator;

        $chain[] = $parts;

        if ($ends_selector) {
            $selectors[] = $chain;
            $chain = [];
        }
    }

    if (count($chain) > 0) {
        $selectors[] = $chain;
    }

    return $selectors;
}
1050 | |||||
/**
 * Magic getter for attributes and the virtual text properties.
 *
 * An entry in $this->attr takes precedence and is returned after being
 * passed through convert_text(). Otherwise 'outertext', 'innertext',
 * 'plaintext' and 'xmltext' are served by the matching method.
 *
 * @param string $name Property name being read.
 *
 * @return mixed The resolved value, or false when the name is neither a set
 *               attribute nor one of the virtual properties.
 */
public function __get($name)
{
    if (isset($this->attr[$name])) {
        $raw = $this->attr[$name];

        return $this->convert_text($raw);
    }

    if ('outertext' === $name) {
        return $this->outertext();
    }

    if ('innertext' === $name) {
        return $this->innertext();
    }

    if ('plaintext' === $name) {
        return $this->text();
    }

    if ('xmltext' === $name) {
        return $this->xmltext();
    }

    return false;
}
1070 | |||||
/**
 * Magic setter for attributes and the virtual text properties.
 *
 * 'outertext' and 'innertext' are stored in the node info array ($this->_).
 * Every other name is stored as an attribute.
 *
 * @param string $name  Property name being written.
 * @param mixed  $value Value to store.
 */
public function __set($name, $value)
{
    if ('outertext' === $name) {
        $this->_[self::HDOM_INFO_OUTER] = $value;

        return;
    }

    if ('innertext' === $name) {
        // Blank an existing text entry before the inner text is stored
        if (isset($this->_[self::HDOM_INFO_TEXT])) {
            $this->_[self::HDOM_INFO_TEXT] = '';
        }

        $this->_[self::HDOM_INFO_INNER] = $value;

        return;
    }

    $this->attr[$name] = $value;
}
1087 | |||||
/**
 * Magic isset() handler.
 *
 * The virtual text properties served by __get() always count as set. This
 * includes 'xmltext', so isset(), empty() and the ?? operator agree with
 * what __get() can actually return. Any other name is set when a matching
 * entry exists in $this->attr.
 *
 * @param string $name Property name being tested.
 *
 * @return bool
 */
public function __isset($name)
{
    switch ($name) {
        case 'outertext':
        case 'innertext':
        case 'plaintext':
        case 'xmltext':
            return true;
    }

    return isset($this->attr[$name]);
}
1101 | |||||
/**
 * Magic unset() handler: removes the attribute with the given name.
 *
 * Nothing happens when isset() reports no such entry in $this->attr.
 *
 * @param string $name Attribute name to remove.
 */
public function __unset($name)
{
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}
1108 | |||||
/**
 * Convert text from the document's source charset to its target charset.
 *
 * The charsets are read from $this->dom->_charset and
 * $this->dom->_target_charset. No conversion takes place when the node has
 * no document, when either charset is empty, when both are the same, or when
 * the target is UTF-8 and the text already is valid UTF-8 (which is logged
 * as a wrongly detected source charset).
 *
 * If iconv() fails, the unconverted text is kept instead of being replaced
 * by iconv()'s false return value.
 *
 * For a UTF-8 target, a byte order mark at the start and/or the end of the
 * result is removed.
 *
 * @param string $text Text to convert.
 *
 * @return string The converted text, or the input when no conversion was
 *                needed or possible.
 */
public function convert_text($text)
{
    $converted_text = $text;

    $sourceCharset = '';
    $targetCharset = '';

    if ($this->dom) {
        // Cast first: a charset that is not set must not reach strtoupper() as null
        $sourceCharset = strtoupper((string) $this->dom->_charset);
        $targetCharset = strtoupper((string) $this->dom->_target_charset);
    }

    if (!empty($sourceCharset) && !empty($targetCharset) && $sourceCharset !== $targetCharset) {
        if ('UTF-8' === $targetCharset && self::is_utf8($text)) {
            Debug::log_once('The source charset was incorrectly detected as ' . $sourceCharset . ' but should have been UTF-8');
        } else {
            $transcoded = iconv($sourceCharset, $targetCharset, $text);

            if (false === $transcoded) {
                // Keep the original text rather than handing false to callers
                Debug::log_once('Unable to convert text from ' . $sourceCharset . ' to ' . $targetCharset);
            } else {
                $converted_text = $transcoded;
            }
        }
    }

    // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
    if ('UTF-8' === $targetCharset) {
        if ("\xef\xbb\xbf" === substr($converted_text, 0, 3)) {
            $converted_text = substr($converted_text, 3);
        }

        if ("\xef\xbb\xbf" === substr($converted_text, -3)) {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
1145 | |||||
/**
 * Check whether a string consists of structurally valid UTF-8 sequences.
 *
 * Every byte below 128 is accepted as is. Any other byte must be a lead byte
 * (192-253) that is followed by the announced number of continuation bytes
 * (128-191). The historic 5 and 6 byte forms (lead bytes 248-253) are
 * accepted. Only the byte structure is checked.
 *
 * @param string $str Bytes to inspect.
 *
 * @return bool False on a stray continuation byte (including 0x80), on the
 *              bytes 254 and 255, and on a truncated or malformed sequence.
 */
public static function is_utf8($str)
{
    $len = strlen($str);

    for ($i = 0; $i < $len; ++$i) {
        $c = ord($str[$i]);

        if ($c < 128) {
            continue; // Single byte character
        }

        if ($c >= 254) {
            return false;
        } elseif ($c >= 252) {
            $bits = 6;
        } elseif ($c >= 248) {
            $bits = 5;
        } elseif ($c >= 240) {
            $bits = 4;
        } elseif ($c >= 224) {
            $bits = 3;
        } elseif ($c >= 192) {
            $bits = 2;
        } else {
            // 128-191 are continuation bytes and can never start a sequence.
            // 128 itself belongs to this range.
            return false;
        }

        // The sequence must not run past the end of the string
        if (($i + $bits) > $len) {
            return false;
        }

        while ($bits > 1) {
            ++$i;
            $b = ord($str[$i]);

            if ($b < 128 || $b > 191) {
                return false;
            }

            --$bits;
        }
    }

    return true;
}
1186 | |||||
/**
 * Determine the display size of an image node.
 *
 * The "width" and "height" attributes are used when present. A dimension
 * that is still -1 afterwards is taken from the inline "style" attribute,
 * provided the declaration is an integer pixel value such as "120px".
 * Declared values are trimmed, so "120px ;" is understood, and "0px" is
 * accepted as a valid size.
 *
 * Possible future enhancements: sizes defined through a class or id of the
 * image itself, through selectors on parent elements, or in external style
 * sheets.
 *
 * @return array|false False when the node is not an "img" element.
 *                     Otherwise ['height' => ..., 'width' => ...] where an
 *                     unknown dimension is -1.
 */
public function get_display_size()
{
    if ('img' !== $this->tag) {
        return false;
    }

    // Key order defines the order of the returned array
    $size = [
        'height' => -1,
        'width' => -1,
    ];

    // See if there is a height or width attribute in the tag itself.
    foreach (array_keys($size) as $dimension) {
        if (isset($this->attr[$dimension])) {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style'])) {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declarations = [];

        preg_match_all(
            '/([\w-]+)\s*:\s*([^;]+)\s*;?/',
            $this->attr['style'],
            $matches,
            PREG_SET_ORDER
        );

        foreach ($matches as $match) {
            // The value group is greedy and swallows whitespace before ";"
            $declarations[$match[1]] = trim($match[2]);
        }

        foreach (array_keys($size) as $dimension) {
            // Only fill in dimensions the attributes did not provide
            if (!isset($declarations[$dimension]) || -1 != $size[$dimension]) {
                continue;
            }

            $declared = $declarations[$dimension];

            // check that the last two characters are px (pixels)
            if ('px' !== strtolower(substr($declared, -2))) {
                continue;
            }

            $pixels = substr($declared, 0, -2);

            // filter_var() returns the integer on success, so compare with
            // false explicitly; a plain truth test would reject "0".
            if (false !== filter_var($pixels, FILTER_VALIDATE_INT)) {
                $size[$dimension] = $pixels;
            }
        }
    }

    return $size;
}
1268 | |||||
/**
 * Render the node as outer text and optionally write it to a file.
 *
 * @param string $filepath Target file. With the default '' nothing is
 *                         written. Otherwise the text is written with an
 *                         exclusive lock.
 *
 * @return mixed The result of outertext().
 */
public function save($filepath = '')
{
    $markup = $this->outertext();

    if ('' === $filepath) {
        return $markup;
    }

    file_put_contents($filepath, $markup, LOCK_EX);

    return $markup;
}
1279 | |||||
/**
 * Add one or more class names to the "class" attribute.
 *
 * A string is split on single spaces. Empty tokens, which result from
 * leading, trailing or repeated spaces, are skipped so that no blank class
 * names or stray spaces end up in the attribute. Names the node already has
 * are not added again.
 *
 * @param string|array $class Space-separated class names or a list of class
 *                            names. Any other type is ignored.
 */
public function addClass($class)
{
    if (is_string($class)) {
        $class = explode(' ', $class);
    }

    if (!is_array($class)) {
        return;
    }

    foreach ($class as $c) {
        if ('' === $c) {
            continue;
        }

        if (!isset($this->class)) {
            // First class name: create the attribute
            $this->class = $c;
        } elseif (!$this->hasClass($c)) {
            $this->class .= ' ' . $c;
        }
    }
}
1300 | |||||
/**
 * Tell whether the "class" attribute contains the given class name.
 *
 * The attribute is split on single spaces and compared strictly.
 *
 * @param string $class Class name to look for.
 *
 * @return bool False as well when $class is not a string or the node has no
 *              "class" attribute.
 */
public function hasClass($class)
{
    if (!is_string($class) || !isset($this->class)) {
        return false;
    }

    $assigned = explode(' ', $this->class);

    return in_array($class, $assigned, true);
}
1311 | |||||
/**
 * Remove class names from the "class" attribute.
 *
 * Without an argument the whole attribute is removed. The attribute is also
 * removed when no class name remains after the removal.
 *
 * @param string|array|null $class Space-separated class names, a list of
 *                                 class names, or null to remove all. Any
 *                                 other type leaves the node untouched.
 */
public function removeClass($class = null)
{
    if (!isset($this->class)) {
        return;
    }

    if (is_null($class)) {
        $this->removeAttribute('class');

        return;
    }

    $unwanted = is_string($class) ? explode(' ', $class) : $class;

    if (!is_array($unwanted)) {
        return;
    }

    $remaining = array_diff(explode(' ', $this->class), $unwanted);

    if (0 === count($remaining)) {
        $this->removeAttribute('class');

        return;
    }

    $this->class = implode(' ', $remaining);
}
1337 | |||||
/**
 * Return the attributes of this node as stored in $this->attr.
 *
 * @return mixed
 */
public function getAllAttributes()
{
    $attributes = $this->attr;

    return $attributes;
}
1342 | |||||
/**
 * Read a property of this node by name.
 *
 * Unlike $node->name syntax this also works for names that are not valid PHP
 * identifiers (for example names containing a colon).
 *
 * @param string $name Property or attribute name.
 *
 * @return mixed
 */
public function getAttribute($name)
{
    return $this->{$name};
}
1347 | |||||
/**
 * Write a property of this node by name.
 *
 * @param string $name  Property or attribute name.
 * @param mixed  $value Value to assign.
 */
public function setAttribute($name, $value)
{
    $this->{$name} = $value;
}
1352 | |||||
/**
 * Tell whether isset() reports the named property of this node as set.
 *
 * @param string $name Property or attribute name.
 *
 * @return bool
 */
public function hasAttribute($name)
{
    return isset($this->{$name});
}
1357 | |||||
/**
 * Unset the named property of this node.
 *
 * @param string $name Property or attribute name.
 */
public function removeAttribute($name)
{
    unset($this->{$name});
}
1362 | |||||
/**
 * Detach this node from its parent via the parent's removeChild().
 *
 * Nothing happens when the node has no parent.
 */
public function remove()
{
    if (!$this->parent) {
        return;
    }

    $this->parent->removeChild($this);
}
1369 | |||||
/**
 * Remove a node, together with everything below it, from this node.
 *
 * The children of $node are removed recursively first. After that the
 * entries of $node->nodes are dropped from $node and from the document's
 * node list. Finally $node itself is dropped from this node's nodes and
 * children as well as from the document's node list, and is cleared.
 *
 * @param HtmlNode $node Node to remove.
 */
public function removeChild($node)
{
    foreach ($node->children as $descendant) {
        $node->removeChild($descendant);
    }

    // No need to re-index node->children because it is about to be removed!

    foreach ($node->nodes as $owned) {
        $own_key = array_search($owned, $node->nodes, true);
        $document_key = array_search($owned, $node->dom->nodes, true);

        if (false !== $own_key) {
            unset($node->nodes[$own_key]);
        }

        if (false !== $document_key) {
            unset($node->dom->nodes[$document_key]);
        }
    }

    // No need to re-index node->nodes because it is about to be removed!

    // Look up all three positions before any of the lists is modified
    $node_key = array_search($node, $this->nodes, true);
    $child_key = array_search($node, $this->children, true);
    $dom_key = array_search($node, $this->dom->nodes, true);

    if (false !== $node_key) {
        unset($this->nodes[$node_key]);
    }

    $this->nodes = array_values($this->nodes);

    if (false !== $child_key) {
        unset($this->children[$child_key]);
    }

    $this->children = array_values($this->children);

    if (false !== $dom_key) {
        unset($this->dom->nodes[$dom_key]);
    }

    // Do not re-index dom->nodes because nodes point to other nodes in the
    // array explicitly!

    $node->clear();
}
1418 | |||||
/**
 * Look up an element by id: calls find() with the selector "#<id>" and
 * index 0.
 *
 * @param string $id Id to search for.
 *
 * @return mixed Whatever find() returns for that selector and index.
 */
public function getElementById($id)
{
    $selector = "#{$id}";

    return $this->find($selector, 0);
}
1423 | |||||
/**
 * Look up elements by id: calls find() with the selector "#<id>" and the
 * given index.
 *
 * @param string   $id  Id to search for.
 * @param int|null $idx Index handed to find() unchanged.
 *
 * @return mixed Whatever find() returns for that selector and index.
 */
public function getElementsById($id, $idx = null)
{
    $selector = "#{$id}";

    return $this->find($selector, $idx);
}
1428 | |||||
/**
 * Look up an element by tag name: calls find() with the name as selector
 * and index 0.
 *
 * @param string $name Tag name, used as selector as is.
 *
 * @return mixed Whatever find() returns for that selector and index.
 */
public function getElementByTagName($name)
{
    $selector = $name;

    return $this->find($selector, 0);
}
1433 | |||||
/**
 * Look up elements by tag name: calls find() with the name as selector and
 * the given index.
 *
 * @param string   $name Tag name, used as selector as is.
 * @param int|null $idx  Index handed to find() unchanged.
 *
 * @return mixed Whatever find() returns for that selector and index.
 */
public function getElementsByTagName($name, $idx = null)
{
    $selector = $name;

    return $this->find($selector, $idx);
}
1438 | |||||
/**
 * DOM-style alias of parent().
 *
 * @return mixed Whatever parent() returns.
 */
public function parentNode()
{
    $parent = $this->parent();

    return $parent;
}
1443 | |||||
/**
 * Return all children or the child at a given position.
 *
 * @param int $idx Position of the wanted child, or -1 (default) for all.
 *
 * @return mixed The complete children list for -1, the child stored at
 *               $idx, or null when there is no child at that position.
 */
public function childNodes($idx = -1)
{
    if (-1 === $idx) {
        return $this->children;
    }

    return isset($this->children[$idx]) ? $this->children[$idx] : null;
}
1456 | |||||
/**
 * Return the first child of this node.
 *
 * @return mixed The child at index 0, or null when there are no children.
 */
public function firstChild()
{
    if (0 === count($this->children)) {
        return null;
    }

    return $this->children[0];
}
1465 | |||||
/**
 * Return the last child of this node.
 *
 * @return mixed The final element of the children list, or null when there
 *               are no children.
 */
public function lastChild()
{
    if (0 === count($this->children)) {
        return null;
    }

    return end($this->children);
}
1474 | |||||
/**
 * Return the node that follows this one in its parent's children list.
 *
 * @return mixed The following sibling, or null when this node has no
 *               parent, is not listed among the parent's children, or is
 *               the last child.
 */
public function nextSibling()
{
    if (null === $this->parent) {
        return null;
    }

    $siblings = $this->parent->children;
    $position = array_search($this, $siblings, true);

    if (false === $position) {
        return null;
    }

    $following = $position + 1;

    return isset($siblings[$following]) ? $siblings[$following] : null;
}
1489 | |||||
/**
 * Return the node that precedes this one in its parent's children list.
 *
 * @return mixed The preceding sibling, or null when this node has no
 *               parent, is not listed among the parent's children, or is
 *               the first child.
 */
public function previousSibling()
{
    if (null === $this->parent) {
        return null;
    }

    $siblings = $this->parent->children;
    $position = array_search($this, $siblings, true);

    if (false === $position || $position <= 0) {
        return null;
    }

    return $siblings[$position - 1];
}
1504 | |||||
/**
 * Tell whether this node has at least one child.
 *
 * @return bool
 */
public function hasChildNodes()
{
    if (empty($this->children)) {
        return false;
    }

    return true;
}
1509 | |||||
/**
 * DOM-style accessor for the tag name of this node.
 *
 * @return mixed The value of $this->tag.
 */
public function nodeName()
{
    $name = $this->tag;

    return $name;
}
1514 | |||||
/**
 * Append a node as the last child of this node.
 *
 * The node gets this node as parent and is added to both the nodes and the
 * children list. When this node belongs to a document, the appended node and
 * all nodes reachable through its children are registered in the document's
 * node list. Each of them gets the document assigned and its begin and end
 * info set to its index in that list. The end info of the document root is
 * then set to the last index of the list.
 *
 * @param HtmlNode $node Node to append.
 *
 * @return $this
 */
public function appendChild($node)
{
    $node->parent = $this;
    $this->nodes[] = $node;
    $this->children[] = $node;

    if (!$this->dom) {
        return $this;
    }

    // Attach the node and everything below it to the DOM
    $pending = [$node];

    while ($pending) {
        $current = array_pop($pending);
        $pending = array_merge($pending, $current->children);

        $this->dom->nodes[] = $current;
        $current->dom = $this->dom;

        $position = count($this->dom->nodes) - 1;
        $current->_[self::HDOM_INFO_BEGIN] = $position;
        $current->_[self::HDOM_INFO_END] = $position;
    }

    $this->dom->root->_[self::HDOM_INFO_END] = count($this->dom->nodes) - 1;

    return $this;
}
1539 | } |
||||
1540 |