1 | <?php |
||||||
2 | |||||||
3 | namespace SunnysideUp\ShareThis; |
||||||
4 | |||||||
5 | /** |
||||||
6 | * Website: http://sourceforge.net/projects/simplehtmldom/ |
||||||
7 | * Additional projects that may be used: http://sourceforge.net/projects/debugobject/ |
||||||
8 | * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) |
||||||
9 | * Contributions by: |
||||||
10 | * Yousuke Kumakura (Attribute filters) |
||||||
11 | * Vadim Voituk (Negative indexes supports of "find" method) |
||||||
12 | * Antcs (Constructor with automatically load contents either text or file/url) |
||||||
13 | * |
||||||
14 | * all affected sections have comments starting with "PaperG" |
||||||
15 | * |
||||||
16 | * Paperg - Added case insensitive testing of the value of the selector. |
||||||
17 | * Paperg - Added tag_start for the starting index of tags - NOTE: This works but not accurately. |
||||||
18 | * This tag_start gets counted AFTER \r\n have been crushed out, and after the remove_noice calls so it will not reflect the REAL position of the tag in the source, |
||||||
19 | * it will almost always be smaller by some amount. |
||||||
20 | * We use this to determine how far into the file the tag in question is. This "percentage" will never be accurate, as $dom->size is the "real" number of bytes the dom was created from, |
||||||
21 | * but for most purposes, it's a really good estimation. |
||||||
22 | * Paperg - Added the forceTagsClosed to the dom constructor. Forcing tags closed is great for malformed html, but it CAN lead to parsing errors. |
||||||
23 | * Allow the user to tell us how much they trust the html. |
||||||
24 | * Paperg add the text and plaintext to the selectors for the find syntax. plaintext implies text in the innertext of a node. text implies that the tag is a text node. |
||||||
25 | * This allows for us to find tags based on the text they contain. |
||||||
26 | * Create find_ancestor_tag to see if a tag is - at any level - inside of another specific tag. |
||||||
27 | * Paperg: added parse_charset so that we know about the character set of the source document. |
||||||
28 | * NOTE: If the user's system has a routine called get_last_retrieve_url_contents_content_type available, we will assume it's returning the content-type header from the |
||||||
29 | * last transfer or curl_exec, and we will parse that and use it in preference to any other method of charset detection. |
||||||
30 | * |
||||||
31 | * Found infinite loop in the case of broken html in restore_noise. Rewrote to protect from that. |
||||||
32 | * PaperG (John Schlick) Added get_display_size for "IMG" tags. |
||||||
33 | * |
||||||
34 | * Licensed under The MIT License |
||||||
35 | * Redistributions of files must retain the above copyright notice. |
||||||
36 | * |
||||||
37 | * @author S.C. Chen <[email protected]> |
||||||
38 | * @author John Schlick |
||||||
39 | * @author Rus Carroll |
||||||
40 | * @version 1.5 ($Rev: 202 $) |
||||||
41 | * @package PlaceLocalInclude |
||||||
42 | * @subpackage simple_html_dom |
||||||
43 | */ |
||||||
44 | |||||||
45 | /** |
||||||
46 | * All of the Defines for the classes below. |
||||||
47 | * @author S.C. Chen <[email protected]> |
||||||
48 | */ |
||||||
// Node type values; stored in simple_html_dom_node::$nodetype.
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);
// Attribute quoting styles; makeup() consults these when re-rendering a tag.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);
// Keys into a node's "info" array (simple_html_dom_node::$_).
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);
// Default argument values for file_get_html() / str_get_html().
define('DEFAULT_TARGET_CHARSET', 'UTF-8');
define('DEFAULT_BR_TEXT', "\r\n");
define('DEFAULT_SPAN_TEXT', " ");
// Upper bound (as measured by strlen) on content the helper functions will parse.
define('MAX_FILE_SIZE', 600000);
||||||
70 | // helper functions |
||||||
71 | // ----------------------------------------------------------------------------- |
||||||
72 | // get html dom from file |
||||||
73 | // $maxlen is defined in the code as PHP_STREAM_COPY_ALL which is defined as -1. |
||||||
/**
 * Read a file or URL with file_get_contents() and parse it into a DOM.
 *
 * @param string   $url              Path or URL handed to file_get_contents().
 * @param bool     $use_include_path Forwarded to file_get_contents().
 * @param resource $context          Optional stream context, forwarded as-is.
 * @param int      $offset           Read offset. The legacy default of -1 means
 *                                   "start at the beginning".
 * @param int      $maxLen           Maximum number of bytes to read; a negative
 *                                   value (the default) reads everything.
 * @param bool     $lowercase        Forwarded to the DOM constructor and load().
 * @param bool     $forceTagsClosed  Forwarded to the DOM constructor.
 * @param string   $target_charset   Forwarded to the DOM constructor.
 * @param bool     $stripRN          Forwarded to the DOM constructor and load().
 * @param string   $defaultBRText    Forwarded to the DOM constructor.
 * @param string   $defaultSpanText  Forwarded to the DOM constructor.
 *
 * @return simple_html_dom|false The loaded DOM, or false when the content is
 *                               empty/unreadable or longer than MAX_FILE_SIZE.
 */
function file_get_html($url, $use_include_path = false, $context = null, $offset = -1, $maxLen = -1, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    // We DO force the tags to be terminated.
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    // -1 is this function's historical "no seek" default. Since PHP 7.1 a
    // negative offset is counted from the END of the stream (and fails on
    // non-seekable remote streams), so translate the sentinel to 0.
    if ((int) $offset === -1) {
        $offset = 0;
    }

    // Only forward a length when the caller asked for one: a negative length
    // is rejected by file_get_contents().
    if ($maxLen !== null && $maxLen >= 0) {
        $contents = file_get_contents($url, $use_include_path, $context, $offset, $maxLen);
    } else {
        $contents = file_get_contents($url, $use_include_path, $context, $offset);
    }

    if (empty($contents) || strlen($contents) > MAX_FILE_SIZE) {
        // Release the unused DOM, mirroring str_get_html().
        $dom->clear();
        return false;
    }

    // The second parameter can force the selectors to all be lowercase.
    $dom->load($contents, $lowercase, $stripRN);
    return $dom;
}
||||||
89 | |||||||
90 | // get html dom from string |
||||||
/**
 * Parse an HTML string into a DOM.
 *
 * All arguments after $str are forwarded to the simple_html_dom constructor;
 * $lowercase and $stripRN are additionally forwarded to load().
 *
 * @return simple_html_dom|false The loaded DOM, or false when $str is empty
 *                               or longer than MAX_FILE_SIZE.
 */
function str_get_html($str, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT)
{
    $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);

    $parseable = !empty($str) && strlen($str) <= MAX_FILE_SIZE;
    if ($parseable) {
        $dom->load($str, $lowercase, $stripRN);
        return $dom;
    }

    // Nothing usable to parse: release the DOM before reporting failure.
    $dom->clear();
    return false;
}
||||||
101 | |||||||
102 | // dump html dom tree |
||||||
/**
 * Print the tree rooted at $node by delegating to the node's dump() method.
 *
 * @param object $node      Node whose dump() method is invoked.
 * @param bool   $show_attr Forwarded to dump() as its first argument.
 * @param int    $deep      Forwarded to dump() as its second argument.
 *
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
    // Previously the node itself was passed as $show_attr and both options
    // were silently ignored.
    $node->dump($show_attr, $deep);
}
||||||
107 | |||||||
108 | |||||||
109 | /** |
||||||
110 | * simple html dom node |
||||||
111 | * PaperG - added ability for "find" routine to lowercase the value of the selector. |
||||||
112 | * PaperG - added $tag_start to track the start position of the tag in the total byte index |
||||||
113 | * |
||||||
114 | * @package PlaceLocalInclude |
||||||
115 | */ |
||||||
116 | class simple_html_dom_node |
||||||
117 | { |
||||||
118 | public $nodetype = HDOM_TYPE_TEXT; |
||||||
119 | public $tag = 'text'; |
||||||
120 | public $attr = []; |
||||||
121 | public $children = []; |
||||||
122 | public $nodes = []; |
||||||
123 | public $parent = null; |
||||||
124 | // The "info" array - see HDOM_INFO_... for what each element contains. |
||||||
125 | public $_ = []; |
||||||
126 | public $tag_start = 0; |
||||||
127 | private $dom = null; |
||||||
128 | |||||||
/**
 * Attach the node to its owning DOM.
 *
 * @param object $dom DOM object; the new node is appended to its $nodes list
 *                    and keeps a back-reference to it.
 */
public function __construct($dom)
{
    $dom->nodes[] = $this;
    $this->dom = $dom;
}
||||||
134 | |||||||
/**
 * Drop this node's object references (via clear()) when it is destroyed.
 */
public function __destruct()
{
    $this->clear();
}
||||||
139 | |||||||
/**
 * String conversion yields the node's outer text (see outertext()).
 *
 * @return string
 */
public function __toString()
{
    return $this->outertext();
}
||||||
144 | |||||||
145 | // clean up memory due to php5 circular references memory leak... |
||||||
/**
 * Null out every reference this node holds to other objects (owning DOM,
 * child lists and parent) so circular references can be collected.
 *
 * @return void
 */
public function clear()
{
    $this->dom = $this->nodes = $this->parent = $this->children = null;
}
||||||
153 | |||||||
154 | // dump node's tree |
||||||
/**
 * Echo this node's tag and, recursively, those of all entries in $nodes,
 * one per line, indented by one space per nesting level.
 *
 * @param bool $show_attr When true, attributes are listed after the tag.
 * @param int  $deep      Current nesting depth (number of leading spaces).
 *
 * @return void
 */
public function dump($show_attr = true, $deep = 0)
{
    echo str_repeat(' ', $deep) . $this->tag;

    if ($show_attr && count($this->attr) > 0) {
        echo '(';
        foreach ($this->attr as $name => $unused) {
            // Values are read through the magic getter, not from $attr directly.
            printf('[%s]=>"%s", ', $name, $this->$name);
        }
        echo ')';
    }
    echo "\n";

    // An empty or cleared (null) node list simply produces no child output.
    foreach ($this->nodes ?: [] as $child) {
        $child->dump($show_attr, $deep + 1);
    }
}
||||||
175 | |||||||
176 | |||||||
177 | // Debugging function to dump a single dom node with a bunch of information about it. |
||||||
/**
 * Build a one-line debugging description of this node: tag, attributes,
 * the contents of the info array, inner text info, child/node counts and
 * tag_start.
 *
 * @param bool $echo When true the description is echoed and nothing is
 *                   returned; when false it is returned instead.
 *
 * @return string|null
 */
public function dump_node($echo = true)
{
    $string = $this->tag;
    if (count($this->attr) > 0) {
        $string .= '(';
        foreach ($this->attr as $k => $v) {
            $string .= "[$k]=>\"" . $this->$k . '", ';
        }
        $string .= ')';
    }
    if (count($this->_) > 0) {
        $string .= ' $_ (';
        foreach ($this->_ as $k => $v) {
            if (is_array($v)) {
                $string .= "[$k]=>(";
                foreach ($v as $k2 => $v2) {
                    $string .= "[$k2]=>\"" . $v2 . '", ';
                }
                $string .= ")";
            } else {
                $string .= "[$k]=>\"" . $v . '", ';
            }
        }
        $string .= ")";
    }

    if (isset($this->text)) {
        $string .= " text: (" . $this->text . ")";
    }

    $string .= " HDOM_INNER_INFO: '";
    // Read from $this: the previous code consulted an undefined $node
    // variable, so this branch could never be taken.
    if (isset($this->_[HDOM_INFO_INNER])) {
        $string .= $this->_[HDOM_INFO_INNER] . "'";
    } else {
        $string .= ' NULL ';
    }

    $string .= " children: " . count($this->children);
    $string .= " nodes: " . count($this->nodes);
    $string .= " tag_start: " . $this->tag_start;
    $string .= "\n";

    if ($echo) {
        echo $string;
        return;
    } else {
        return $string;
    }
}
||||||
227 | |||||||
228 | // returns the parent of node |
||||||
229 | // If a node is passed in, it will reset the parent of the current node to that one. |
||||||
/**
 * Get the parent node, optionally assigning a new one first.
 *
 * When $parent is given, this node is appended to the new parent's $nodes
 * and $children lists. It is NOT removed from the lists of any previous
 * parent.
 *
 * @param object|null $parent New parent node, or null to only read.
 *
 * @return object|null The (possibly new) parent.
 */
public function parent($parent = null)
{
    if ($parent === null) {
        return $this->parent;
    }

    $this->parent = $parent;
    $parent->nodes[] = $this;
    $parent->children[] = $this;

    return $this->parent;
}
||||||
242 | |||||||
243 | // verify that node has children |
||||||
/**
 * Tell whether the $children list holds at least one entry.
 *
 * @return bool
 */
public function has_child()
{
    return empty($this->children) === false;
}
||||||
248 | |||||||
249 | // returns children of node |
||||||
/**
 * Access the $children list.
 *
 * @param int $idx -1 (default) returns the whole list; any other value is
 *                 used as an index into it.
 *
 * @return array|object|null The list, the child at $idx, or null when no
 *                           child exists at that index.
 */
public function children($idx = -1)
{
    if ($idx === -1) {
        return $this->children;
    }

    return isset($this->children[$idx]) ? $this->children[$idx] : null;
}
||||||
260 | |||||||
261 | // returns the first child of node |
||||||
/**
 * Return the first entry of $children, or null when there are none.
 *
 * @return object|null
 */
public function first_child()
{
    return count($this->children) > 0 ? $this->children[0] : null;
}
||||||
269 | |||||||
270 | // returns the last child of node |
||||||
/**
 * Return the last entry of $children, or null when there are none.
 *
 * @return object|null
 */
public function last_child()
{
    $total = count($this->children);

    return $total > 0 ? $this->children[$total - 1] : null;
}
||||||
278 | |||||||
279 | // returns the next sibling of node |
||||||
/**
 * Return the entry that follows this node in its parent's $children list.
 *
 * @return object|null Null when there is no parent, when this node is the
 *                     last child, or when it is not in the list at all.
 */
public function next_sibling()
{
    if ($this->parent === null) {
        return null;
    }

    $total = count($this->parent->children);

    // Locate our own position; $pos ends at $total when we are not listed.
    for ($pos = 0; $pos < $total; ++$pos) {
        if ($this === $this->parent->children[$pos]) {
            break;
        }
    }

    $next = $pos + 1;

    return $next < $total ? $this->parent->children[$next] : null;
}
||||||
296 | |||||||
297 | // returns the previous sibling of node |
||||||
/**
 * Return the entry that precedes this node in its parent's $children list.
 *
 * @return object|null Null when there is no parent, when this node is the
 *                     first child, or when it is not in the list at all.
 */
public function prev_sibling()
{
    if ($this->parent === null) {
        return null;
    }

    $total = count($this->parent->children);
    for ($pos = 0; $pos < $total; ++$pos) {
        if ($this === $this->parent->children[$pos]) {
            return $pos > 0 ? $this->parent->children[$pos - 1] : null;
        }
    }

    // Not found among the parent's children. The previous implementation
    // fell through with the index at count() and wrongly returned the
    // parent's LAST child here.
    return null;
}
||||||
313 | |||||||
314 | // function to locate a specific ancestor tag in the path to the root. |
||||||
/**
 * Walk from this node up through the parent chain and return the first
 * node whose tag equals $tag (loose comparison). The walk starts with this
 * node itself.
 *
 * @param string $tag Tag name to look for.
 *
 * @return object|null The matching node, or null when the chain is
 *                     exhausted without a match.
 */
public function find_ancestor_tag($tag)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    for ($candidate = $this; !is_null($candidate); $candidate = $candidate->parent) {
        if (is_object($debug_object)) {
            $debug_object->debugLog(2, "Current tag is: " . $candidate->tag);
        }

        if ($candidate->tag == $tag) {
            return $candidate;
        }
    }

    return null;
}
||||||
337 | |||||||
338 | // get dom node's inner html |
||||||
/**
 * Return the node's inner markup.
 *
 * Precedence: an explicit HDOM_INFO_INNER entry, then the noise-restored
 * HDOM_INFO_TEXT entry, otherwise the concatenated outertext() of every
 * entry in $nodes.
 *
 * @return string
 */
public function innertext()
{
    $info = $this->_;

    if (isset($info[HDOM_INFO_INNER])) {
        return $info[HDOM_INFO_INNER];
    }
    if (isset($info[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($info[HDOM_INFO_TEXT]);
    }

    $markup = '';
    foreach ($this->nodes as $child) {
        $markup .= $child->outertext();
    }

    return $markup;
}
||||||
354 | |||||||
355 | // get dom node's outer text (with tag) |
||||||
/**
 * Return the node's markup including its own tag.
 *
 * The root node yields its innertext(). For every other node the DOM's
 * callback (if any) is invoked first; then an explicit HDOM_INFO_OUTER or
 * HDOM_INFO_TEXT entry wins. Otherwise the result is assembled from the
 * begin tag (makeup()), the inner content and, when HDOM_INFO_END is set
 * and non-zero, a closing tag.
 *
 * @return string
 */
public function outertext()
{
    global $debug_object;
    if (is_object($debug_object)) {
        $suffix = '';
        if ($this->tag == 'text' && !empty($this->text)) {
            $suffix = " with text: " . $this->text;
        }
        $debug_object->debugLog(1, 'Innertext of tag: ' . $this->tag . $suffix);
    }

    if ($this->tag === 'root') {
        return $this->innertext();
    }

    // Give the user-supplied callback a chance to alter the node before
    // any of the info entries below are consulted.
    if ($this->dom && $this->dom->callback !== null) {
        call_user_func($this->dom->callback, $this);
    }

    if (isset($this->_[HDOM_INFO_OUTER])) {
        return $this->_[HDOM_INFO_OUTER];
    }
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    // Begin tag.
    $markup = "";
    if ($this->dom && $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]) {
        $markup = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
    }

    // Inner content.
    if (isset($this->_[HDOM_INFO_INNER])) {
        // A br tag never emits the inner info that may have been attached to it.
        if ($this->tag != "br") {
            $markup .= $this->_[HDOM_INFO_INNER];
        }
    } elseif ($this->nodes) {
        foreach ($this->nodes as $child) {
            $markup .= $this->convert_text($child->outertext());
        }
    }

    // End tag.
    if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
        $markup .= '</' . $this->tag . '>';
    }

    return $markup;
}
||||||
412 | |||||||
413 | // get dom node's plain text |
||||||
/**
 * Return the node's plain text.
 *
 * An explicit HDOM_INFO_INNER entry is returned as-is. Text nodes return
 * their noise-restored HDOM_INFO_TEXT; comment and unknown nodes, as well
 * as script and style tags, return an empty string. Any other node returns
 * the converted text() of all entries in $nodes, with the DOM's
 * default_span_text appended when the tag is "span".
 *
 * @return string
 */
public function text()
{
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }

    $type = $this->nodetype;
    if ($type == HDOM_TYPE_TEXT) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }
    if ($type == HDOM_TYPE_COMMENT || $type == HDOM_TYPE_UNKNOWN) {
        return '';
    }

    if (strcasecmp($this->tag, 'script') === 0 || strcasecmp($this->tag, 'style') === 0) {
        return '';
    }

    // $nodes has been observed to be NULL for some element nodes; treat
    // that as "no text" rather than iterating over it.
    if (is_null($this->nodes)) {
        return '';
    }

    $plain = '';
    foreach ($this->nodes as $child) {
        $plain .= $this->convert_text($child->text());
    }

    // Separate consecutive spans so their text does not run together.
    if ($this->tag == "span") {
        $plain .= $this->dom->default_span_text;
    }

    return $plain;
}
||||||
447 | |||||||
/**
 * Return innertext() with CDATA delimiters removed: the opening
 * "<![CDATA[" is stripped case-insensitively, the closing "]]>" exactly.
 *
 * @return string
 */
public function xmltext()
{
    $withoutOpeners = str_ireplace('<![CDATA[', '', $this->innertext());

    return str_replace(']]>', '', $withoutOpeners);
}
||||||
455 | |||||||
456 | // build node's text with tag |
||||||
/**
 * Render the node's opening tag, including its attributes.
 *
 * Nodes carrying HDOM_INFO_TEXT (text, comment, unknown) return that text
 * with noise restored instead. Attributes whose value is null or false are
 * skipped; a value of true renders the bare attribute name. Spacing and
 * quoting are taken from the HDOM_INFO_SPACE / HDOM_INFO_QUOTE entries at
 * the attribute's position.
 *
 * @return string
 */
public function makeup()
{
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $tagText = '<' . $this->tag;
    $position = 0;

    foreach ($this->attr as $name => $value) {
        // Positions count every attribute, including skipped ones.
        $i = $position++;

        if ($value === null || $value === false) {
            continue;
        }

        $tagText .= $this->_[HDOM_INFO_SPACE][$i][0];

        // Valueless attribute: nowrap, checked, selected...
        if ($value === true) {
            $tagText .= $name;
            continue;
        }

        $style = $this->_[HDOM_INFO_QUOTE][$i];
        if ($style == HDOM_QUOTE_DOUBLE) {
            $quote = '"';
        } elseif ($style == HDOM_QUOTE_SINGLE) {
            $quote = '\'';
        } else {
            $quote = '';
        }

        $tagText .= $name . $this->_[HDOM_INFO_SPACE][$i][1] . '=' . $this->_[HDOM_INFO_SPACE][$i][2] . $quote . $value . $quote;
    }

    $tagText = $this->dom->restore_noise($tagText);

    return $tagText . $this->_[HDOM_INFO_ENDSPACE] . '>';
}
||||||
491 | |||||||
492 | // find elements by css selector |
||||||
493 | //PaperG - added ability for find to lowercase the value of the selector. |
||||||
/**
 * Find nodes below this one that match a CSS-like selector.
 *
 * @param string   $selector  Selector string; comma-separated alternatives
 *                            are each evaluated and their results merged.
 * @param int|null $idx       Null returns every match; otherwise the match
 *                            at that position (negative counts from the end).
 * @param bool     $lowercase Forwarded to seek() for case-insensitive
 *                            value comparison.
 *
 * @return array|object|null List of matches ordered by node index, a single
 *                           node, or null when $idx has no match.
 */
public function find($selector, $idx = null, $lowercase = false)
{
    $alternatives = $this->parse_selector($selector);
    if (count($alternatives) === 0) {
        return [];
    }

    $matchedKeys = [];

    foreach ($alternatives as $chain) {
        if (count($chain) === 0) {
            return [];
        }
        if (!isset($this->_[HDOM_INFO_BEGIN])) {
            return [];
        }

        // Start from this node and narrow down one descendant level at a
        // time (iteratively, not recursively).
        $frontier = array($this->_[HDOM_INFO_BEGIN] => 1);
        foreach ($chain as $step) {
            $reached = [];
            foreach ($frontier as $key => $unused) {
                $origin = ($key === -1) ? $this->dom->root : $this->dom->nodes[$key];
                $origin->seek($step, $reached, $lowercase);
            }
            $frontier = $reached;
        }

        // Array union keeps keys already recorded and adds the new ones.
        $matchedKeys += $frontier;
    }

    ksort($matchedKeys);

    $found = [];
    foreach (array_keys($matchedKeys) as $key) {
        $found[] = $this->dom->nodes[$key];
    }

    if (is_null($idx)) {
        return $found;
    }
    if ($idx < 0) {
        $idx += count($found);
    }

    return isset($found[$idx]) ? $found[$idx] : null;
}
||||||
549 | |||||||
550 | // seek for given conditions |
||||||
551 | // PaperG - added parameter to allow for case insensitive testing of the value of a selector. |
||||||
/**
 * Collect the nodes below this one that satisfy a single parsed selector.
 *
 * Matches are reported through $ret, keyed by node index with value 1.
 *
 * @param array $selector  Tuple of (tag, key, val, exp, no_key) as produced
 *                         by parse_selector().
 * @param array $ret       Output map, passed by reference.
 * @param bool  $lowercase When true, selector value and node value are
 *                         lowercased before comparison.
 *
 * @return void
 */
protected function seek($selector, &$ret, $lowercase = false)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    list($tag, $key, $val, $exp, $no_key) = $selector;

    // xpath index: pick the $key-th direct child with a matching tag.
    if ($tag && $key && is_numeric($key)) {
        $count = 0;
        foreach ($this->children as $c) {
            if ($tag === '*' || $tag === $c->tag) {
                if (++$count == $key) {
                    $ret[$c->_[HDOM_INFO_BEGIN]] = 1;
                    return;
                }
            }
        }
        return;
    }

    // Determine where the scan stops. Without an own HDOM_INFO_END, borrow
    // the one of the nearest ancestor that has it, reduced by one for every
    // ancestor skipped on the way.
    $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
    if ($end == 0) {
        $parent = $this->parent;
        while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
            $end -= 1;
            $parent = $parent->parent;
        }
        // The walk may run off the top of the tree; only dereference an
        // ancestor that actually exists.
        if ($parent !== null) {
            $end += $parent->_[HDOM_INFO_END];
        }
    }

    for ($i = $this->_[HDOM_INFO_BEGIN] + 1; $i < $end; ++$i) {
        $node = $this->dom->nodes[$i];

        $pass = true;

        // Bare "*" only matches direct children.
        if ($tag === '*' && !$key) {
            if (in_array($node, $this->children, true)) {
                $ret[$i] = 1;
            }
            continue;
        }

        // compare tag
        if ($tag && $tag != $node->tag && $tag !== '*') {
            $pass = false;
        }
        // compare key
        if ($pass && $key) {
            if ($no_key) {
                if (isset($node->attr[$key])) {
                    $pass = false;
                }
            } else {
                // "plaintext" is a pseudo-attribute and never stored in attr.
                if (($key != "plaintext") && !isset($node->attr[$key])) {
                    $pass = false;
                }
            }
        }
        // compare value
        if ($pass && $key && $val && $val !== '*') {
            if ($key == "plaintext") {
                // A plaintext search compares against the node's text().
                $nodeKeyValue = $node->text();
            } else {
                // A normal search compares against the attribute value.
                $nodeKeyValue = $node->attr[$key];
            }
            if (is_object($debug_object)) {
                $debug_object->debugLog(2, "testing node: " . $node->tag . " for attribute: " . $key . $exp . $val . " where nodes value is: " . $nodeKeyValue);
            }

            // If lowercase is set, do a case insensitive test of the value.
            if ($lowercase) {
                $check = $this->match($exp, strtolower($val), strtolower($nodeKeyValue));
            } else {
                $check = $this->match($exp, $val, $nodeKeyValue);
            }
            if (is_object($debug_object)) {
                $debug_object->debugLog(2, "after match: " . ($check ? "true" : "false"));
            }

            // handle multiple class: test each space-separated class name.
            if (!$check && strcasecmp($key, 'class') === 0) {
                foreach (explode(' ', $node->attr[$key]) as $k) {
                    // Skip blanks produced by leading, trailing or double spaces.
                    if (!empty($k)) {
                        if ($lowercase) {
                            $check = $this->match($exp, strtolower($val), strtolower($k));
                        } else {
                            $check = $this->match($exp, $val, $k);
                        }
                        if ($check) {
                            break;
                        }
                    }
                }
            }
            if (!$check) {
                $pass = false;
            }
        }
        if ($pass) {
            $ret[$i] = 1;
        }
        unset($node);
    }
    // $ret is passed by reference, so it is effectively the return value.
    if (is_object($debug_object)) {
        $debug_object->debugLog(1, "EXIT - ret: ", $ret);
    }
}
||||||
667 | |||||||
/**
 * Test an attribute value against a selector value.
 *
 * @param string $exp     Operator: '=', '!=', '^=' (prefix), '$=' (suffix)
 *                        or '*=' (regex / contains).
 * @param string $pattern Selector value. For '*=' a pattern starting with
 *                        '/' is used verbatim as a regex; otherwise it is
 *                        wrapped in /.../i without quoting.
 * @param string $value   Value taken from the node.
 *
 * @return bool|int Boolean for '=' and '!=', the preg_match() result for
 *                  the regex-based operators, false for unknown operators.
 */
protected function match($exp, $pattern, $value)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    if ($exp == '=') {
        return ($value === $pattern);
    }
    if ($exp == '!=') {
        return ($value !== $pattern);
    }
    if ($exp == '^=') {
        return preg_match("/^" . preg_quote($pattern, '/') . "/", $value);
    }
    if ($exp == '$=') {
        return preg_match("/" . preg_quote($pattern, '/') . "$/", $value);
    }
    if ($exp == '*=') {
        $regex = ($pattern[0] == '/') ? $pattern : "/" . $pattern . "/i";
        return preg_match($regex, $value);
    }

    return false;
}
||||||
692 | |||||||
/**
 * Split a selector string into its comma-separated alternatives, each a
 * list of (tag, key, val, exp, no_key) tuples - one per descendant level.
 *
 * @param string $selector_string CSS-like selector, e.g. "div#id a.cls, p[!x]".
 *
 * @return array List of alternatives; each alternative is a list of tuples.
 */
protected function parse_selector($selector_string)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    // Pattern of CSS selectors, modified from mootools.
    // The colon is allowed in tag and attribute names so that
    // <tag attr:ibute="something"> can be matched.
    // An optional, uncaptured "@" may precede the attribute name.
    // Hyphens inside the character classes are escaped: PCRE2 (PHP 7.3+)
    // refuses to compile "[\w-:]" ("invalid range in character class"),
    // which made every find() call fail.
    $pattern = "/([\w\-:\*]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
    preg_match_all($pattern, trim($selector_string) . ' ', $matches, PREG_SET_ORDER);
    if (is_object($debug_object)) {
        $debug_object->debugLog(2, "Matches Array: ", $matches);
    }

    $selectors = [];
    $result = [];

    foreach ($matches as $m) {
        $m[0] = trim($m[0]);
        if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') {
            continue;
        }
        // for browser generated xpath
        if ($m[1] === 'tbody') {
            continue;
        }

        list($tag, $key, $val, $exp, $no_key) = array($m[1], null, null, '=', false);
        if (!empty($m[2])) {
            $key = 'id';
            $val = $m[2];
        }
        if (!empty($m[3])) {
            $key = 'class';
            $val = $m[3];
        }
        if (!empty($m[4])) {
            $key = $m[4];
        }
        if (!empty($m[5])) {
            $exp = $m[5];
        }
        if (!empty($m[6])) {
            $val = $m[6];
        }

        // convert to lowercase
        if ($this->dom->lowercase) {
            $tag = strtolower($tag);
            // $key may still be null here; the cast yields the same ''
            // result as before without passing null to strtolower()
            // (deprecated as of PHP 8.1).
            $key = strtolower((string) $key);
        }
        // elements that do NOT have the specified attribute
        if (isset($key[0]) && $key[0] === '!') {
            $key = substr($key, 1);
            $no_key = true;
        }

        $result[] = array($tag, $key, $val, $exp, $no_key);
        // A comma closes the current alternative.
        if (trim($m[7]) === ',') {
            $selectors[] = $result;
            $result = [];
        }
    }
    if (count($result) > 0) {
        $selectors[] = $result;
    }
    return $selectors;
}
||||||
768 | |||||||
/**
 * Magic property reader.
 *
 * A set attribute is returned through convert_text(). Otherwise the virtual
 * properties outertext / innertext / plaintext / xmltext are served by their
 * corresponding methods. Any other name yields a boolean telling whether the
 * attribute key exists at all.
 *
 * @param string $name property or attribute name
 * @return mixed
 */
public function __get($name)
{
    if (isset($this->attr[$name])) {
        $raw = $this->attr[$name];
        return $this->convert_text($raw);
    }

    switch ($name) {
        case 'outertext':
            $value = $this->outertext();
            break;
        case 'innertext':
            $value = $this->innertext();
            break;
        case 'plaintext':
            $value = $this->text();
            break;
        case 'xmltext':
            $value = $this->xmltext();
            break;
        default:
            $value = array_key_exists($name, $this->attr);
            break;
    }

    return $value;
}
||||||
782 | |||||||
/**
 * Magic property writer.
 *
 * 'outertext' and 'innertext' are stored in the node's info array; every other
 * name is stored as an attribute. When the attribute was not set before, a
 * spacing entry and a double-quote marker are appended for it.
 *
 * @param string $name  property or attribute name
 * @param mixed  $value new value
 * @return mixed the assigned value for outertext/innertext, nothing otherwise
 */
public function __set($name, $value)
{
    switch ($name) {
        case 'outertext':
            $this->_[HDOM_INFO_OUTER] = $value;
            return $value;

        case 'innertext':
            // write to the text slot when one exists, otherwise to the inner slot
            $slot = isset($this->_[HDOM_INFO_TEXT]) ? HDOM_INFO_TEXT : HDOM_INFO_INNER;
            $this->_[$slot] = $value;
            return $value;
    }

    $attributeIsNew = !isset($this->attr[$name]);
    if ($attributeIsNew) {
        $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
        $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
    }

    $this->attr[$name] = $value;
}
||||||
799 | |||||||
/**
 * Magic isset() handler.
 *
 * outertext, innertext and plaintext always count as set. For anything else
 * the attribute list is consulted; a key that merely exists counts as set,
 * which covers attributes without a value (nowrap, checked, selected...).
 *
 * @param string $name property or attribute name
 * @return bool
 */
public function __isset($name)
{
    switch ($name) {
        case 'outertext':
        case 'innertext':
        case 'plaintext':
            return true;
    }

    // no value attr: nowrap, checked selected...
    return array_key_exists($name, $this->attr) || isset($this->attr[$name]);
}
||||||
810 | |||||||
/**
 * Magic unset() handler: drops the attribute when it is currently set.
 *
 * @param string $name attribute name
 */
public function __unset($name)
{
    if (!isset($this->attr[$name])) {
        return;
    }

    unset($this->attr[$name]);
}
||||||
817 | |||||||
/**
 * PaperG - Convert text from the document's charset to the target charset
 * when the two are known and differ.
 *
 * Both charsets are read from the owning dom ($_charset / $_target_charset).
 * If the target is UTF-8 and the text already validates as UTF-8, it is left
 * untouched on the assumption that the reported source charset was wrong.
 * For a UTF-8 target, a byte order mark at the start or the end is removed.
 *
 * @param string $text text to convert
 * @return string converted text; the unconverted text when iconv() fails
 */
public function convert_text($text)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    $converted_text = $text;

    $sourceCharset = "";
    $targetCharset = "";

    if ($this->dom) {
        $sourceCharset = strtoupper($this->dom->_charset);
        $targetCharset = strtoupper($this->dom->_target_charset);
    }
    if (is_object($debug_object)) {
        $debug_object->debugLog(3, "source charset: " . $sourceCharset . " target charaset: " . $targetCharset);
    }

    if (!empty($sourceCharset) && !empty($targetCharset) && (strcasecmp($sourceCharset, $targetCharset) != 0)) {
        // Check if the reported encoding could have been incorrect and the text is actually already UTF-8
        $alreadyUtf8 = (strcasecmp($targetCharset, 'UTF-8') == 0) && $this->is_utf8($text);
        if (!$alreadyUtf8) {
            $attempt = iconv($sourceCharset, $targetCharset, $text);
            // iconv() returns false on failure; keep the original text
            // instead of replacing it with false / an empty string.
            if ($attempt !== false) {
                $converted_text = $attempt;
            }
        }
    }

    // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
    if ($targetCharset == 'UTF-8') {
        if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 3);
        }
        if (substr($converted_text, -3) === "\xef\xbb\xbf") {
            $converted_text = substr($converted_text, 0, -3);
        }
    }

    return $converted_text;
}
||||||
860 | |||||||
/**
 * Returns true if $str is structurally valid UTF-8 and false otherwise.
 *
 * Every byte >= 0x80 must be a lead byte that announces a sequence of 2 to 6
 * bytes, followed by the right number of continuation bytes (0x80-0xBF).
 * A continuation byte in lead position, 0xFE/0xFF, or a truncated sequence
 * makes the string invalid. Only the byte structure is checked.
 *
 * @param mixed $str String to be tested
 * @return boolean
 */
public static function is_utf8($str)
{
    $len = strlen($str);
    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);
        // Plain ASCII needs no further checks. The boundary is 127, not 128:
        // 0x80 is a continuation byte and must not be accepted on its own.
        if ($c <= 127) {
            continue;
        }

        if ($c >= 254) {
            return false;
        } elseif ($c >= 252) {
            $bits = 6;
        } elseif ($c >= 248) {
            $bits = 5;
        } elseif ($c >= 240) {
            $bits = 4;
        } elseif ($c >= 224) {
            $bits = 3;
        } elseif ($c >= 192) {
            $bits = 2;
        } else {
            // 0x80-0xBF: continuation byte without a lead byte
            return false;
        }

        // the announced sequence must fit into the remaining bytes
        if (($i + $bits) > $len) {
            return false;
        }

        // every following byte of the sequence must be a continuation byte
        while ($bits > 1) {
            $i++;
            $b = ord($str[$i]);
            if ($b < 128 || $b > 191) {
                return false;
            }
            $bits--;
        }
    }
    return true;
}
||||||
906 | /* |
||||||
907 | function is_utf8($string) |
||||||
908 | { |
||||||
909 | //this is buggy |
||||||
910 | return (utf8_encode(utf8_decode($string)) == $string); |
||||||
911 | } |
||||||
912 | */ |
||||||
913 | |||||||
/**
 * Function to try a few tricks to determine the displayed size of an img on the page.
 * NOTE: This will ONLY work on an IMG tag. Returns FALSE on all other tag types.
 *
 * The width/height attributes of the tag win. When one of them is missing,
 * the inline style attribute is searched for a matching declaration that is
 * a whole number of pixels (e.g. "width: 120px").
 *
 * @author John Schlick
 * @version April 19 2012
 * @return array|false an array containing the 'height' and 'width' of the image on the page
 *                     (-1 for a dimension we can't figure out), or false when this is not an img tag.
 */
public function get_display_size()
{
    if ($this->tag !== 'img') {
        return false;
    }

    $size = array('width' => -1, 'height' => -1);
    $dimensions = array_keys($size);

    // See if there is a height or width attribute in the tag itself.
    foreach ($dimensions as $dimension) {
        if (isset($this->attr[$dimension])) {
            $size[$dimension] = $this->attr[$dimension];
        }
    }

    // Now look for an inline style.
    if (isset($this->attr['style'])) {
        // Thanks to user gnarf from stackoverflow for this regular expression.
        $declarations = [];
        $matches = [];
        preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER);
        foreach ($matches as $match) {
            // Property names are compared lowercase. The value is trimmed because
            // the pattern captures whitespace in front of the ';' ("100px ;").
            $declarations[strtolower($match[1])] = trim($match[2]);
        }

        foreach ($dimensions as $dimension) {
            // only fill in dimensions the tag attributes did not provide
            if ($size[$dimension] != -1 || !isset($declarations[$dimension])) {
                continue;
            }
            $declared = $declarations[$dimension];
            // check that the last two characters are px (pixels)
            if (strtolower(substr($declared, -2)) !== 'px') {
                continue;
            }
            $pixels = trim(substr($declared, 0, -2));
            // Make sure that it's an integer. Compare against false explicitly:
            // filter_var() returns int 0 for "0", which is a valid size.
            if (filter_var($pixels, FILTER_VALIDATE_INT) !== false) {
                $size[$dimension] = $pixels;
            }
        }
    }

    // Future enhancement:
    // Look in the tag to see if there is a class or id specified that has a height or width attribute to it.

    // Far future enhancement
    // Look at all the parent tags of this image to see if they specify a class or id that has an img selector that specifies a height or width
    // Note that in this case, the class or id will have the img subselector for it to apply to the image.

    // ridiculously far future development
    // If the class or id is specified in a SEPARATE css file thats not on the page, go get it and do what we were just doing for the ones on the page.

    return array('height' => $size['height'],
                 'width' => $size['width']);
}
||||||
990 | |||||||
// camel naming conventions: DOM-style aliases that delegate to the methods above

/** @return array the node's attribute array */
public function getAllAttributes()
{
    return $this->attr;
}

/** Alias of __get($name). */
public function getAttribute($name)
{
    return $this->__get($name);
}

/** Alias of __set($name, $value). */
public function setAttribute($name, $value)
{
    $this->__set($name, $value);
}

/** Alias of __isset($name). */
public function hasAttribute($name)
{
    return $this->__isset($name);
}

/** Assigns null to the attribute through __set(). */
public function removeAttribute($name)
{
    $this->__set($name, null);
}

/** First result of find() for the selector "#<id>". */
public function getElementById($id)
{
    $selector = '#' . $id;
    return $this->find($selector, 0);
}

/** Results of find() for the selector "#<id>", optionally restricted to index $idx. */
public function getElementsById($id, $idx=null)
{
    $selector = '#' . $id;
    return $this->find($selector, $idx);
}

/** First result of find() for the given tag name. */
public function getElementByTagName($name)
{
    return $this->find($name, 0);
}

/** Results of find() for the given tag name, optionally restricted to index $idx. */
public function getElementsByTagName($name, $idx=null)
{
    return $this->find($name, $idx);
}

/** Alias of parent(). */
public function parentNode()
{
    return $this->parent();
}

/** Alias of children($idx). */
public function childNodes($idx=-1)
{
    return $this->children($idx);
}

/** Alias of first_child(). */
public function firstChild()
{
    return $this->first_child();
}

/** Alias of last_child(). */
public function lastChild()
{
    return $this->last_child();
}

/** Alias of next_sibling(). */
public function nextSibling()
{
    return $this->next_sibling();
}

/** Alias of prev_sibling(). */
public function previousSibling()
{
    return $this->prev_sibling();
}

/** Alias of has_child(). */
public function hasChildNodes()
{
    return $this->has_child();
}

/** @return string the node's tag name */
public function nodeName()
{
    return $this->tag;
}

/** Hands this node to $node->parent() and returns $node. */
public function appendChild($node)
{
    $node->parent($this);
    return $node;
}
||||||
1065 | } |
||||||
1066 | |||||||
1067 | /** |
||||||
1068 | * simple html dom parser |
||||||
1069 | * Paperg - in the find routine: allow us to specify that we want case insensitive testing of the value of the selector. |
||||||
1070 | * Paperg - change $size from protected to public so we can easily access it |
||||||
1071 | * Paperg - added ForceTagsClosed in the constructor which tells us whether we trust the html or not. Default is to NOT trust it. |
||||||
1072 | * |
||||||
1073 | * @package PlaceLocalInclude |
||||||
1074 | */ |
||||||
1075 | class simple_html_dom |
||||||
1076 | { |
||||||
1077 | public $root = null; |
||||||
1078 | public $nodes = []; |
||||||
1079 | public $callback = null; |
||||||
1080 | public $lowercase = false; |
||||||
1081 | // Used to keep track of how large the text was when we started. |
||||||
1082 | public $original_size; |
||||||
1083 | public $size; |
||||||
1084 | protected $pos; |
||||||
1085 | protected $doc; |
||||||
1086 | protected $char; |
||||||
1087 | protected $cursor; |
||||||
1088 | protected $parent; |
||||||
1089 | protected $noise = []; |
||||||
1090 | protected $token_blank = " \t\r\n"; |
||||||
1091 | protected $token_equal = ' =/>'; |
||||||
1092 | protected $token_slash = " />\r\n\t"; |
||||||
1093 | protected $token_attr = ' >'; |
||||||
1094 | // Note that this is referenced by a child node, and so it needs to be public for that node to see this information. |
||||||
1095 | public $_charset = ''; |
||||||
1096 | public $_target_charset = ''; |
||||||
1097 | protected $default_br_text = ""; |
||||||
1098 | public $default_span_text = ""; |
||||||
1099 | |||||||
1100 | // use isset instead of in_array, performance boost about 30%... |
||||||
1101 | protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1); |
||||||
1102 | protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1); |
||||||
1103 | // Known sourceforge issue #2977341 |
||||||
1104 | // B tags that are not closed cause us to return everything to the end of the document. |
||||||
1105 | protected $optional_closing_tags = array( |
||||||
1106 | 'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1), |
||||||
1107 | 'th'=>array('th'=>1), |
||||||
1108 | 'td'=>array('td'=>1), |
||||||
1109 | 'li'=>array('li'=>1), |
||||||
1110 | 'dt'=>array('dt'=>1, 'dd'=>1), |
||||||
1111 | 'dd'=>array('dd'=>1, 'dt'=>1), |
||||||
1112 | 'dl'=>array('dd'=>1, 'dt'=>1), |
||||||
1113 | 'p'=>array('p'=>1), |
||||||
1114 | 'nobr'=>array('nobr'=>1), |
||||||
1115 | 'b'=>array('b'=>1), |
||||||
1116 | 'option'=>array('option'=>1), |
||||||
1117 | ); |
||||||
1118 | |||||||
/**
 * Create the dom and, when $str is given, load it right away.
 *
 * $str is treated as a location and passed to load_file() when it starts with
 * http:// or https:// or names an existing file; otherwise it is parsed as markup.
 *
 * @param string|null $str             markup, file path or URL to load (optional)
 * @param bool        $lowercase       forwarded to load()
 * @param bool        $forceTagsClosed false empties the optional-closing-tag table (the html is trusted)
 * @param string      $target_charset  stored in $_target_charset
 * @param bool        $stripRN         forwarded to load()
 * @param string      $defaultBRText   forwarded to load()
 * @param string      $defaultSpanText forwarded to load()
 */
public function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    // Forcing tags to be closed implies that we don't trust the html, but it can lead to parsing errors if we SHOULD trust the html.
    // This has to happen BEFORE loading, and it has to target the property the
    // parser actually reads (optional_closing_tags); otherwise the flag is a no-op.
    if (!$forceTagsClosed) {
        $this->optional_closing_tags = [];
    }
    $this->_target_charset = $target_charset;

    if ($str) {
        if (preg_match("/^https?:\/\//i", $str) || is_file($str)) {
            $this->load_file($str);
        } else {
            $this->load($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);
        }
    }
}
||||||
1134 | |||||||
/**
 * Release everything held by this dom via clear() when the object is destroyed.
 */
public function __destruct()
{
    $this->clear();
}
||||||
1139 | |||||||
/**
 * Load html from a string.
 *
 * Resets the parser through prepare(), hands comments, CDATA, script, style,
 * code, server-side and smarty blocks to remove_noise(), parses the document
 * and finally determines its charset.
 *
 * @param string $str             markup to parse
 * @param bool   $lowercase       forwarded to prepare()
 * @param bool   $stripRN         forwarded to prepare()
 * @param string $defaultBRText   forwarded to prepare()
 * @param string $defaultSpanText forwarded to prepare()
 * @return $this so that calls can be chained
 */
public function load($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    global $debug_object;

    $this->prepare($str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText);

    // comments
    $this->remove_noise("'<!--(.*?)-->'is");
    // cdata
    $this->remove_noise("'<!\[CDATA\[(.*?)\]\]>'is", true);

    // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
    // script tags are removed before style tags.
    $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
    $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
    // <style> tags
    $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
    $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");

    // preformatted tags
    $this->remove_noise("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is");
    // server side scripts
    $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);
    // smarty scripts
    $this->remove_noise("'(\{\w)(.*?)(\})'s", true);

    // keep parsing until parse() reports that nothing is left
    do {
        $hasMore = $this->parse();
    } while ($hasMore);

    // mark the end of the root node, then work out the charset
    $this->root->_[HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();

    return $this;
}
||||||
1175 | |||||||
/**
 * Load html from a file or URL.
 *
 * All arguments are passed on to file_get_contents() unchanged.
 *
 * @return false|null false when the contents could not be read (the dom is cleared), nothing otherwise
 */
public function load_file()
{
    $args = func_get_args();
    $contents = call_user_func_array('file_get_contents', $args);

    // Check the read itself rather than error_get_last(): that function also
    // reports unrelated earlier errors, which made successful loads fail.
    if ($contents === false) {
        $this->clear();
        return false;
    }

    $this->load($contents, true);
}
||||||
1187 | |||||||
/**
 * Set the callback function.
 *
 * @param mixed $function_name value stored in $callback
 */
public function set_callback($function_name)
{
    $this->callback = $function_name;
}

/**
 * Remove the callback function by resetting $callback to null.
 */
public function remove_callback()
{
    $this->callback = null;
}
||||||
1199 | |||||||
/**
 * Save the dom as a string.
 *
 * @param string $filepath when not '', the markup is also written to this path (with LOCK_EX)
 * @return string inner text of the root node
 */
public function save($filepath='')
{
    $markup = $this->root->innertext();

    if ($filepath === '') {
        return $markup;
    }

    file_put_contents($filepath, $markup, LOCK_EX);
    return $markup;
}
||||||
1209 | |||||||
/**
 * Find dom nodes by css selector; delegates to find() on the root node.
 * Paperg - allow us to specify that we want case insensitive testing of the value of the selector.
 *
 * @param string   $selector  css selector
 * @param int|null $idx       forwarded to the root node's find()
 * @param bool     $lowercase forwarded to the root node's find()
 * @return mixed whatever the root node's find() returns
 */
public function find($selector, $idx=null, $lowercase=false)
{
    $rootNode = $this->root;
    return $rootNode->find($selector, $idx, $lowercase);
}
||||||
1216 | |||||||
/**
 * Clean up memory due to php5 circular references memory leak...
 *
 * Clears every known node, then clears and unsets the parent and root links,
 * and finally unsets the document text and the noise list.
 */
public function clear()
{
    foreach ($this->nodes as $node) {
        $node->clear();
    }

    // This is documented in the sourceforge repository. 2977248 as a fix for ongoing memory leaks that occur even with the use of clear.
    if (isset($this->children)) {
        foreach ($this->children as $child) {
            $child->clear();
        }
    }

    // parent first, then root
    foreach (array('parent', 'root') as $link) {
        if (isset($this->$link)) {
            $this->$link->clear();
            unset($this->$link);
        }
    }

    unset($this->doc, $this->noise);
}
||||||
1242 | |||||||
/**
 * Dump the tree; delegates to dump() on the root node.
 *
 * @param bool $show_attr forwarded to the root node's dump()
 */
public function dump($show_attr=true)
{
    $rootNode = $this->root;
    $rootNode->dump($show_attr);
}
||||||
1247 | |||||||
/**
 * Prepare HTML data and init everything.
 *
 * Clears the previous state, records the size of the input, optionally
 * replaces every \r and \n with a space, and creates a fresh root node that
 * also becomes the current parent.
 *
 * @param string $str             markup to parse
 * @param bool   $lowercase       stored in $lowercase
 * @param bool   $stripRN         replace \r and \n by spaces before parsing
 * @param string $defaultBRText   stored in $default_br_text
 * @param string $defaultSpanText stored in $default_span_text
 */
protected function prepare($str, $lowercase=true, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
{
    $this->clear();

    // Length of the content before we do anything to it; kept as original_size
    // because it might be useful to someone.
    $byteCount = strlen($str);
    $this->size = $byteCount;
    $this->original_size = $byteCount;

    // before we save the string as the doc... strip out the \r \n's if we are told to.
    if ($stripRN) {
        $str = str_replace(array("\r", "\n"), " ", $str);
        // set the length of content since we have changed it.
        $this->size = strlen($str);
    }

    // reset parser state
    $this->doc = $str;
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = [];
    $this->nodes = [];
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    // fresh root node, which is also the current parent
    $rootNode = new simple_html_dom_node($this);
    $this->root = $rootNode;
    $rootNode->tag = 'root';
    $rootNode->_[HDOM_INFO_BEGIN] = -1;
    $rootNode->nodetype = HDOM_TYPE_ROOT;
    $this->parent = $rootNode;

    // current character, only when there is any input
    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
||||||
1284 | |||||||
/**
 * Parse html content: one step of the parsing loop.
 *
 * Copies everything up to the next '<'. When nothing was copied, the tag
 * reader takes over and its result is returned; otherwise the copied text
 * becomes a new text node.
 *
 * @return bool result of read_tag(), or true after a text node was added
 */
protected function parse()
{
    $text = $this->copy_until_char('<');
    if ($text === '') {
        return $this->read_tag();
    }

    // text
    $textNode = new simple_html_dom_node($this);
    ++$this->cursor;
    $textNode->_[HDOM_INFO_TEXT] = $text;
    $this->link_nodes($textNode, false);

    return true;
}
||||||
1299 | |||||||
/**
 * PAPERG - dkchou - Try to identify the character set of the page we have just parsed,
 * so we know better how to spit it out later.
 *
 * Sources, in order of precedence:
 *  1. the content-type reported by get_last_retrieve_url_contents_content_type(), when such a
 *     function exists (it should return the CURLINFO_CONTENT_TYPE / content_type header of the last transfer);
 *  2. the content attribute of <meta http-equiv=Content-Type>; if that tag has content but no charset, ISO-8859-1;
 *  3. mb_detect_encoding() on the plain text, falling back to UTF-8.
 * ISO-8859-1 / Latin1 / Latin-1 are replaced by their superset CP1252.
 *
 * @return string the detected charset, also stored in $_charset
 */
protected function parse_charset()
{
    global $debug_object;

    $charset = null;
    // Case-insensitive; captures only the charset token itself, so surrounding quotes,
    // further ";"-separated parameters and trailing whitespace are not part of the name.
    $charsetPattern = '/charset\s*=\s*["\']?([^\s;"\']+)/i';
    $matches = [];

    if (function_exists('get_last_retrieve_url_contents_content_type')) {
        $contentTypeHeader = get_last_retrieve_url_contents_content_type();
        if (preg_match($charsetPattern, $contentTypeHeader, $matches)) {
            $charset = $matches[1];
            if (is_object($debug_object)) {
                $debug_object->debugLog(2, 'header content-type found charset of: ' . $charset);
            }
        }
    }

    if (empty($charset)) {
        $el = $this->root->find('meta[http-equiv=Content-Type]', 0);
        if (!empty($el)) {
            $fullvalue = $el->content;
            if (is_object($debug_object)) {
                $debug_object->debugLog(2, 'meta content-type tag found' . $fullvalue);
            }

            if (!empty($fullvalue)) {
                if (preg_match($charsetPattern, $fullvalue, $matches)) {
                    $charset = $matches[1];
                } else {
                    // If there is a meta tag, and they don't specify the character set, research says that it's typically ISO-8859-1
                    if (is_object($debug_object)) {
                        $debug_object->debugLog(2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.');
                    }
                    $charset = 'ISO-8859-1';
                }
            }
        }
    }

    // If we couldn't find a charset above, then lets try to detect one based on the text we got...
    if (empty($charset)) {
        // Have php try to detect the encoding from the text given to us.
        $charset = mb_detect_encoding($this->root->plaintext . "ascii", array("UTF-8", "CP1252"));
        if (is_object($debug_object)) {
            $debug_object->debugLog(2, 'mb_detect found: ' . $charset);
        }

        // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - cause this will usually give us most of what we need...
        if ($charset === false) {
            if (is_object($debug_object)) {
                $debug_object->debugLog(2, 'since mb_detect failed - using default of utf-8');
            }
            $charset = 'UTF-8';
        }
    }

    // Since CP1252 is a superset, if we get one of it's subsets, we want it instead.
    if (in_array(strtolower($charset), array('iso-8859-1', 'latin1', 'latin-1'), true)) {
        if (is_object($debug_object)) {
            $debug_object->debugLog(2, 'replacing ' . $charset . ' with CP1252 as its a superset');
        }
        $charset = 'CP1252';
    }

    if (is_object($debug_object)) {
        $debug_object->debugLog(1, 'EXIT - ' . $charset);
    }

    return $this->_charset = $charset;
}
||||||
1374 | |||||||
1375 | // read tag info |
||||||
1376 | protected function read_tag() |
||||||
1377 | { |
||||||
1378 | if ($this->char!=='<') { |
||||||
1379 | $this->root->_[HDOM_INFO_END] = $this->cursor; |
||||||
1380 | return false; |
||||||
1381 | } |
||||||
1382 | $begin_tag_pos = $this->pos; |
||||||
1383 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next |
||||||
1384 | |||||||
1385 | // end tag |
||||||
1386 | if ($this->char==='/') { |
||||||
1387 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next |
||||||
1388 | // This represents the change in the simple_html_dom trunk from revision 180 to 181. |
||||||
1389 | // $this->skip($this->token_blank_t); |
||||||
1390 | $this->skip($this->token_blank); |
||||||
1391 | $tag = $this->copy_until_char('>'); |
||||||
1392 | |||||||
1393 | // skip attributes in end tag |
||||||
1394 | if (($pos = strpos($tag, ' '))!==false) { |
||||||
1395 | $tag = substr($tag, 0, $pos); |
||||||
1396 | } |
||||||
1397 | |||||||
1398 | $parent_lower = strtolower($this->parent->tag); |
||||||
1399 | $tag_lower = strtolower($tag); |
||||||
1400 | |||||||
1401 | if ($parent_lower!==$tag_lower) { |
||||||
1402 | if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) { |
||||||
1403 | $this->parent->_[HDOM_INFO_END] = 0; |
||||||
1404 | $org_parent = $this->parent; |
||||||
1405 | |||||||
1406 | while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) { |
||||||
1407 | $this->parent = $this->parent->parent; |
||||||
1408 | } |
||||||
1409 | |||||||
1410 | if (strtolower($this->parent->tag)!==$tag_lower) { |
||||||
1411 | $this->parent = $org_parent; // restore origonal parent |
||||||
1412 | if ($this->parent->parent) { |
||||||
1413 | $this->parent = $this->parent->parent; |
||||||
1414 | } |
||||||
1415 | $this->parent->_[HDOM_INFO_END] = $this->cursor; |
||||||
1416 | return $this->as_text_node($tag); |
||||||
1417 | } |
||||||
1418 | } elseif (($this->parent->parent) && isset($this->block_tags[$tag_lower])) { |
||||||
1419 | $this->parent->_[HDOM_INFO_END] = 0; |
||||||
1420 | $org_parent = $this->parent; |
||||||
1421 | |||||||
1422 | while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower) { |
||||||
1423 | $this->parent = $this->parent->parent; |
||||||
1424 | } |
||||||
1425 | |||||||
1426 | if (strtolower($this->parent->tag)!==$tag_lower) { |
||||||
1427 | $this->parent = $org_parent; // restore origonal parent |
||||||
1428 | $this->parent->_[HDOM_INFO_END] = $this->cursor; |
||||||
1429 | return $this->as_text_node($tag); |
||||||
1430 | } |
||||||
1431 | } elseif (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) { |
||||||
1432 | $this->parent->_[HDOM_INFO_END] = 0; |
||||||
1433 | $this->parent = $this->parent->parent; |
||||||
1434 | } else { |
||||||
1435 | return $this->as_text_node($tag); |
||||||
1436 | } |
||||||
1437 | } |
||||||
1438 | |||||||
1439 | $this->parent->_[HDOM_INFO_END] = $this->cursor; |
||||||
1440 | if ($this->parent->parent) { |
||||||
1441 | $this->parent = $this->parent->parent; |
||||||
1442 | } |
||||||
1443 | |||||||
1444 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next |
||||||
1445 | return true; |
||||||
1446 | } |
||||||
1447 | |||||||
1448 | $node = new simple_html_dom_node($this); |
||||||
1449 | $node->_[HDOM_INFO_BEGIN] = $this->cursor; |
||||||
1450 | ++$this->cursor; |
||||||
1451 | $tag = $this->copy_until($this->token_slash); |
||||||
1452 | $node->tag_start = $begin_tag_pos; |
||||||
1453 | |||||||
1454 | // doctype, cdata & comments... |
||||||
1455 | if (isset($tag[0]) && $tag[0]==='!') { |
||||||
1456 | $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>'); |
||||||
1457 | |||||||
1458 | if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') { |
||||||
1459 | $node->nodetype = HDOM_TYPE_COMMENT; |
||||||
1460 | $node->tag = 'comment'; |
||||||
1461 | } else { |
||||||
1462 | $node->nodetype = HDOM_TYPE_UNKNOWN; |
||||||
1463 | $node->tag = 'unknown'; |
||||||
1464 | } |
||||||
1465 | if ($this->char==='>') { |
||||||
1466 | $node->_[HDOM_INFO_TEXT].='>'; |
||||||
1467 | } |
||||||
1468 | $this->link_nodes($node, true); |
||||||
1469 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next |
||||||
1470 | return true; |
||||||
1471 | } |
||||||
1472 | |||||||
1473 | // text |
||||||
1474 | if ($pos=strpos($tag, '<')!==false) { |
||||||
0 ignored issues
–
show
|
|||||||
1475 | $tag = '<' . substr($tag, 0, -1); |
||||||
1476 | $node->_[HDOM_INFO_TEXT] = $tag; |
||||||
1477 | $this->link_nodes($node, false); |
||||||
1478 | $this->char = $this->doc[--$this->pos]; // prev |
||||||
1479 | return true; |
||||||
1480 | } |
||||||
1481 | |||||||
1482 | if (!preg_match("/^[\w-:]+$/", $tag)) { |
||||||
1483 | $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>'); |
||||||
1484 | if ($this->char==='<') { |
||||||
1485 | $this->link_nodes($node, false); |
||||||
1486 | return true; |
||||||
1487 | } |
||||||
1488 | |||||||
1489 | if ($this->char==='>') { |
||||||
1490 | $node->_[HDOM_INFO_TEXT].='>'; |
||||||
1491 | } |
||||||
1492 | $this->link_nodes($node, false); |
||||||
1493 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next |
||||||
1494 | return true; |
||||||
1495 | } |
||||||
1496 | |||||||
1497 | // begin tag |
||||||
1498 | $node->nodetype = HDOM_TYPE_ELEMENT; |
||||||
1499 | $tag_lower = strtolower($tag); |
||||||
1500 | $node->tag = ($this->lowercase) ? $tag_lower : $tag; |
||||||
1501 | |||||||
1502 | // handle optional closing tags |
||||||
1503 | if (isset($this->optional_closing_tags[$tag_lower])) { |
||||||
1504 | while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) { |
||||||
1505 | $this->parent->_[HDOM_INFO_END] = 0; |
||||||
1506 | $this->parent = $this->parent->parent; |
||||||
1507 | } |
||||||
1508 | $node->parent = $this->parent; |
||||||
1509 | } |
||||||
1510 | |||||||
1511 | $guard = 0; // prevent infinity loop |
||||||
1512 | $space = array($this->copy_skip($this->token_blank), '', ''); |
||||||
1513 | |||||||
1514 | // attributes |
||||||
1515 | do { |
||||||
1516 | if ($this->char!==null && $space[0]==='') { |
||||||
1517 | break; |
||||||
1518 | } |
||||||
1519 | $name = $this->copy_until($this->token_equal); |
||||||
1520 | if ($guard===$this->pos) { |
||||||
1521 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next |
||||||
1522 | continue; |
||||||
1523 | } |
||||||
1524 | $guard = $this->pos; |
||||||
1525 | |||||||
1526 | // handle endless '<' |
||||||
1527 | if ($this->pos>=$this->size-1 && $this->char!=='>') { |
||||||
1528 | $node->nodetype = HDOM_TYPE_TEXT; |
||||||
1529 | $node->_[HDOM_INFO_END] = 0; |
||||||
1530 | $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name; |
||||||
1531 | $node->tag = 'text'; |
||||||
1532 | $this->link_nodes($node, false); |
||||||
1533 | return true; |
||||||
1534 | } |
||||||
1535 | |||||||
1536 | // handle mismatch '<' |
||||||
1537 | if ($this->doc[$this->pos-1]=='<') { |
||||||
1538 | $node->nodetype = HDOM_TYPE_TEXT; |
||||||
1539 | $node->tag = 'text'; |
||||||
1540 | $node->attr = []; |
||||||
1541 | $node->_[HDOM_INFO_END] = 0; |
||||||
1542 | $node->_[HDOM_INFO_TEXT] = substr($this->doc, $begin_tag_pos, $this->pos-$begin_tag_pos-1); |
||||||
1543 | $this->pos -= 2; |
||||||
1544 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next |
||||||
1545 | $this->link_nodes($node, false); |
||||||
1546 | return true; |
||||||
1547 | } |
||||||
1548 | |||||||
1549 | if ($name!=='/' && $name!=='') { |
||||||
1550 | $space[1] = $this->copy_skip($this->token_blank); |
||||||
1551 | $name = $this->restore_noise($name); |
||||||
1552 | if ($this->lowercase) { |
||||||
1553 | $name = strtolower($name); |
||||||
1554 | } |
||||||
1555 | if ($this->char==='=') { |
||||||
1556 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next |
||||||
1557 | $this->parse_attr($node, $name, $space); |
||||||
1558 | } else { |
||||||
1559 | //no value attr: nowrap, checked selected... |
||||||
1560 | $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO; |
||||||
1561 | $node->attr[$name] = true; |
||||||
1562 | if ($this->char!='>') { |
||||||
1563 | $this->char = $this->doc[--$this->pos]; |
||||||
1564 | } // prev |
||||||
1565 | } |
||||||
1566 | $node->_[HDOM_INFO_SPACE][] = $space; |
||||||
1567 | $space = array($this->copy_skip($this->token_blank), '', ''); |
||||||
1568 | } else { |
||||||
1569 | break; |
||||||
1570 | } |
||||||
1571 | } while ($this->char!=='>' && $this->char!=='/'); |
||||||
1572 | |||||||
1573 | $this->link_nodes($node, true); |
||||||
1574 | $node->_[HDOM_INFO_ENDSPACE] = $space[0]; |
||||||
1575 | |||||||
1576 | // check self closing |
||||||
1577 | if ($this->copy_until_char_escape('>')==='/') { |
||||||
1578 | $node->_[HDOM_INFO_ENDSPACE] .= '/'; |
||||||
1579 | $node->_[HDOM_INFO_END] = 0; |
||||||
1580 | } else { |
||||||
1581 | // reset parent |
||||||
1582 | if (!isset($this->self_closing_tags[strtolower($node->tag)])) { |
||||||
1583 | $this->parent = $node; |
||||||
1584 | } |
||||||
1585 | } |
||||||
1586 | $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next |
||||||
1587 | |||||||
1588 | // If it's a BR tag, we need to set it's text to the default text. |
||||||
1589 | // This way when we see it in plaintext, we can generate formatting that the user wants. |
||||||
1590 | // since a br tag never has sub nodes, this works well. |
||||||
1591 | if ($node->tag == "br") { |
||||||
1592 | $node->_[HDOM_INFO_INNER] = $this->default_br_text; |
||||||
1593 | } |
||||||
1594 | |||||||
1595 | return true; |
||||||
1596 | } |
||||||
1597 | |||||||
1598 | // parse attributes |
||||||
/**
 * Parse the value of the attribute $name at the current cursor position and
 * store it on $node.
 *
 * The cursor is expected to sit just after the '=' sign. The value is always
 * consumed from the input, even when $name was already set on the node: only
 * the first occurrence of an attribute is kept, but the duplicate's value
 * still has to be read past so the caller's attribute loop can carry on with
 * the attributes that follow it. (Previously the method returned before
 * consuming the value, which left the cursor on the value and made the caller
 * abandon the remaining attributes of the tag.)
 *
 * @param object $node  node being built; its attr and _[HDOM_INFO_QUOTE] entries are written
 * @param string $name  attribute name
 * @param array  $space whitespace triple, by reference; index 2 receives the blanks
 *                      found between '=' and the value (left untouched for duplicates)
 * @return void
 */
protected function parse_attr($node, $name, &$space)
{
    // Per sourceforge: http://sourceforge.net/tracker/?func=detail&aid=3061408&group_id=218559&atid=1044037
    // If the attribute is already defined inside a tag, only pay attention to the first one as opposed to the last one.
    $is_duplicate = isset($node->attr[$name]);

    // Blanks between '=' and the value are skipped in every case.
    $blank = $this->copy_skip($this->token_blank);
    if (!$is_duplicate) {
        $space[2] = $blank;
    }

    switch ($this->char) {
        case '"':
            $quote_type = HDOM_QUOTE_DOUBLE;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // step over opening quote
            $value = $this->copy_until_char_escape('"');
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // step over closing quote
            break;
        case '\'':
            $quote_type = HDOM_QUOTE_SINGLE;
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // step over opening quote
            $value = $this->copy_until_char_escape('\'');
            $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // step over closing quote
            break;
        default:
            // Unquoted value: read up to the next character listed in token_attr.
            $quote_type = HDOM_QUOTE_NO;
            $value = $this->copy_until($this->token_attr);
    }

    if ($is_duplicate) {
        // Value consumed; keep the first definition and record nothing.
        return;
    }

    $value = $this->restore_noise($value);
    // PaperG: Attributes should not have \r or \n in them, that counts as html whitespace.
    $value = str_replace(array("\r", "\n"), '', $value);
    // PaperG: trim "class" values since some people leave leading/trailing space in the multi class case.
    if ($name === 'class') {
        $value = trim($value);
    }

    $node->_[HDOM_INFO_QUOTE][] = $quote_type;
    $node->attr[$name] = $value;
}
||||||
1633 | |||||||
1634 | // link node's parent |
||||||
/**
 * Hook $node into the tree underneath the parser's current parent.
 *
 * The node is always appended to the parent's "nodes" list; it is appended
 * to the parent's "children" list as well only when $is_child is truthy.
 *
 * @param object $node     node to attach (its parent property is overwritten)
 * @param bool   $is_child whether the node also counts as a child
 * @return void
 */
protected function link_nodes(&$node, $is_child)
{
    $node->parent = $this->parent;
    $this->parent->nodes[] = $node;

    if (!$is_child) {
        return;
    }

    $this->parent->children[] = $node;
}
||||||
1643 | |||||||
1644 | // as a text node |
||||||
/**
 * Record a closing tag as literal text.
 *
 * Creates a new node whose text is "</$tag>", links it under the current
 * parent (not as a child), bumps the node cursor and advances the input
 * position by one character.
 *
 * @param string $tag tag name to wrap in "</" and ">"
 * @return bool always true
 */
protected function as_text_node($tag)
{
    $text_node = new simple_html_dom_node($this);
    $this->cursor++;
    $text_node->_[HDOM_INFO_TEXT] = "</{$tag}>";
    $this->link_nodes($text_node, false);

    // move to the next input character (null once past the end)
    $this->pos++;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }

    return true;
}
||||||
1654 | |||||||
/**
 * Advance the input position past the leading run of characters that all
 * belong to $chars, then refresh the current character.
 *
 * @param string $chars set of characters to skip over
 * @return void
 */
protected function skip($chars)
{
    $run_length = strspn($this->doc, $chars, $this->pos);
    $this->pos += $run_length;

    // current character becomes null once the position is past the end
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null;
    }
}
||||||
1660 | |||||||
/**
 * Like skip(), but also returns the characters that were skipped.
 *
 * @param string $chars set of characters to skip over
 * @return string the skipped run, or '' when nothing matched
 */
protected function copy_skip($chars)
{
    $from = $this->pos;
    $run_length = strspn($this->doc, $chars, $from);

    $this->pos += $run_length;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null; // ran off the end of the document
    }

    // explicit '' for an empty run rather than relying on substr()
    return ($run_length === 0) ? '' : substr($this->doc, $from, $run_length);
}
||||||
1672 | |||||||
/**
 * Consume input up to (not including) the first character that appears in
 * $chars and return the consumed text.
 *
 * @param string $chars set of stop characters
 * @return mixed the result of substr() over the consumed span
 */
protected function copy_until($chars)
{
    $from = $this->pos;
    $span = strcspn($this->doc, $chars, $from);

    $this->pos += $span;
    if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
    } else {
        $this->char = null; // reached the end without hitting a stop character
    }

    return substr($this->doc, $from, $span);
}
||||||
1681 | |||||||
/**
 * Consume input up to (not including) the next occurrence of $char.
 *
 * - Returns '' without moving when the input is exhausted or when $char is
 *   found exactly at the current position.
 * - When $char does not occur again, the rest of the document is consumed
 *   and returned, and the current character becomes null.
 *
 * @param string $char text to search for
 * @return mixed the consumed text
 */
protected function copy_until_char($char)
{
    if (null === $this->char) {
        return '';
    }

    $from = $this->pos;
    $hit = strpos($this->doc, $char, $from);

    if (false === $hit) {
        // no further occurrence: swallow everything that is left
        $rest = substr($this->doc, $from, $this->size - $from);
        $this->char = null;
        $this->pos = $this->size;
        return $rest;
    }

    if ($hit === $from) {
        return '';
    }

    $this->pos = $hit;
    $this->char = $this->doc[$hit];

    return substr($this->doc, $from, $hit - $from);
}
||||||
1703 | |||||||
/**
 * Same as copy_until_char(), except that an occurrence of $char directly
 * preceded by a backslash is treated as escaped and skipped over.
 *
 * @param string $char text to search for
 * @return mixed the consumed text ('' when nothing was consumed)
 */
protected function copy_until_char_escape($char)
{
    if (null === $this->char) {
        return '';
    }

    $from = $this->pos;
    $search_from = $from;

    do {
        $hit = strpos($this->doc, $char, $search_from);

        if (false === $hit) {
            // no unescaped occurrence left: swallow the rest of the document
            $rest = substr($this->doc, $from, $this->size - $from);
            $this->char = null;
            $this->pos = $this->size;
            return $rest;
        }

        if ($hit === $from) {
            return '';
        }

        // a backslash right before the hit means "keep looking after it"
        $escaped = ($this->doc[$hit - 1] === '\\');
        $search_from = $hit + 1;
    } while ($escaped);

    $this->pos = $hit;
    $this->char = $this->doc[$hit];

    return substr($this->doc, $from, $hit - $from);
}
||||||
1734 | |||||||
1735 | // remove noise from html content |
||||||
1736 | // save the noise in the $this->noise array. |
||||||
/**
 * Cut every match of $pattern out of the document and park it in
 * $this->noise under a generated "___noise___" key, leaving the key in the
 * document as a placeholder.
 *
 * @param string $pattern    PCRE pattern handed to preg_match_all()
 * @param bool   $remove_tag true: replace the whole match (group 0);
 *                           false: replace only capture group 1
 * @return void
 */
protected function remove_noise($pattern, $remove_tag=false)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    $total = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
    $group = $remove_tag ? 0 : 1;

    // Walk the matches last-to-first so the recorded byte offsets of the
    // earlier matches are still valid after each substitution.
    for ($i = $total - 1; $i >= 0; $i--) {
        $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);
        if (is_object($debug_object)) {
            $debug_object->debugLog(2, 'key is: ' . $key);
        }

        $noise_text = $matches[$i][$group][0];
        $noise_offset = $matches[$i][$group][1];

        $this->noise[$key] = $noise_text;
        $this->doc = substr_replace($this->doc, $key, $noise_offset, strlen($noise_text));
    }

    // the document length changed: refresh the cached size and first character
    $this->size = strlen($this->doc);
    if ($this->size > 0) {
        $this->char = $this->doc[0];
    }
}
||||||
1762 | |||||||
1763 | // restore noise to html content |
||||||
/**
 * Replace every "___noise___" placeholder in $text with the noise stored
 * for it in $this->noise.
 *
 * A placeholder is the 11-character marker followed by a 5-character key
 * suffix. Three cases are handled:
 *  - known key: the stored noise is substituted; scanning resumes at the
 *    same spot so placeholders inside the restored noise are expanded too;
 *  - unknown key: the placeholder is replaced by an
 *    "UNDEFINED NOISE FOR KEY: <key>" message. That message contains the
 *    marker again, so scanning resumes *after* it. (Previously the search
 *    restarted from the beginning, found the marker inside its own message
 *    and looped forever while the string kept growing.)
 *  - marker too close to the end of $text to carry a key: replaced by
 *    "NO NUMERIC NOISE KEY".
 *
 * @param string $text text possibly containing noise placeholders
 * @return string text with placeholders resolved
 */
public function restore_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    $offset = 0; // where the next marker search starts
    while (($pos = strpos($text, '___noise___', $offset)) !== false) {
        // Sometimes there is a broken piece of markup, and we don't GET the pos+11 etc... token which indicates a problem outside of us...
        if (strlen($text) > $pos + 15) {
            // marker (11 chars) + key suffix (5 chars)
            $key = substr($text, $pos, 16);
            if (is_object($debug_object)) {
                $debug_object->debugLog(2, 'located key of: ' . $key);
            }

            if (isset($this->noise[$key])) {
                $text = substr($text, 0, $pos) . $this->noise[$key] . substr($text, $pos + 16);
                // rescan from here: restored noise may contain placeholders of its own
                $offset = $pos;
            } else {
                $message = 'UNDEFINED NOISE FOR KEY: ' . $key;
                $text = substr($text, 0, $pos) . $message . substr($text, $pos + 16);
                // step over the message, it embeds the marker and must not be matched again
                $offset = $pos + strlen($message);
            }
        } else {
            // There is no valid key being given back to us... We must get rid of the ___noise___ or we will have a problem.
            $text = substr($text, 0, $pos) . 'NO NUMERIC NOISE KEY' . substr($text, $pos + 11);
            $offset = $pos;
        }
    }

    return $text;
}
||||||
1792 | |||||||
1793 | // Sometimes we NEED one of the noise elements. |
||||||
/**
 * Look through the stored noise for the first entry that contains $text.
 *
 * @param string $text substring to look for
 * @return string|null the first matching noise entry, or null when none matches
 */
public function search_noise($text)
{
    global $debug_object;
    if (is_object($debug_object)) {
        $debug_object->debugLogEntry(1);
    }

    foreach ($this->noise as $candidate) {
        if (strpos($candidate, $text) === false) {
            continue;
        }
        return $candidate;
    }

    return null;
}
||||||
/**
 * String conversion: yields whatever the root node's innertext() returns.
 *
 * @return string
 */
public function __toString()
{
    $root = $this->root;

    return $root->innertext();
}
||||||
1811 | |||||||
/**
 * Magic read access for a fixed set of virtual properties.
 *
 *  - outertext, innertext: the root node's innertext()
 *  - plaintext:            the root node's text()
 *  - charset:              $this->_charset
 *  - target_charset:       $this->_target_charset
 *
 * @param string $name property being read
 * @return mixed the mapped value, or null for any other name
 */
public function __get($name)
{
    // for the DOM as a whole, outer and inner text are the same thing
    if ($name === 'outertext' || $name === 'innertext') {
        return $this->root->innertext();
    }

    if ($name === 'plaintext') {
        return $this->root->text();
    }

    if ($name === 'charset') {
        return $this->_charset;
    }

    if ($name === 'target_charset') {
        return $this->_target_charset;
    }

    return null;
}
||||||
1827 | |||||||
1828 | // camel naming conventions |
||||||
/**
 * Camel-case convenience wrapper: forwards to the root node's childNodes().
 *
 * @param int $idx index passed through unchanged (defaults to -1)
 * @return mixed whatever the root node returns
 */
public function childNodes($idx=-1)
{
    $root = $this->root;

    return $root->childNodes($idx);
}
||||||
/**
 * Camel-case convenience wrapper: forwards to the root node's first_child().
 *
 * @return mixed whatever the root node returns
 */
public function firstChild()
{
    $root = $this->root;

    return $root->first_child();
}
||||||
/**
 * Camel-case convenience wrapper: forwards to the root node's last_child().
 *
 * @return mixed whatever the root node returns
 */
public function lastChild()
{
    $root = $this->root;

    return $root->last_child();
}
||||||
/**
 * Build a detached element by parsing "<$name>$value</$name>" and returning
 * the first child of the resulting document.
 *
 * The parsed document is a DOM object of this class, which exposes
 * firstChild(); the snake_case first_child() exists on nodes only, so the
 * camel-case accessor is used here.
 *
 * @param string      $name  tag name
 * @param string|null $value inner content placed between the tags
 * @return mixed the created element, or null when parsing produced no document
 */
public function createElement($name, $value=null)
{
    // '@' kept from the original call so anything str_get_html() may emit stays suppressed.
    $dom = @str_get_html("<$name>$value</$name>");

    // NOTE(review): str_get_html() is defined elsewhere; presumably it can
    // return a non-object (e.g. false) on failure, so guard before the call.
    if (!is_object($dom)) {
        return null;
    }

    return $dom->firstChild();
}
||||||
/**
 * Build a detached text node by parsing $value and returning the last entry
 * of the parsed document's node list.
 *
 * @param string $value text to turn into a node
 * @return mixed the last parsed node; false when the node list is empty;
 *               null when parsing produced no usable document
 */
public function createTextNode($value)
{
    $dom = @str_get_html($value);

    // NOTE(review): str_get_html() is defined elsewhere; presumably it can
    // return a non-object (e.g. false). Dereferencing that and handing the
    // result to end() raises a TypeError on PHP 8 that '@' cannot silence.
    if (!is_object($dom) || !is_array($dom->nodes)) {
        return null;
    }

    return end($dom->nodes);
}
||||||
/**
 * Camel-case convenience wrapper: runs find() with the selector "#<id>" and
 * index 0.
 *
 * @param string $id id value appended to '#'
 * @return mixed whatever find() returns
 */
public function getElementById($id)
{
    $selector = '#' . $id;

    return $this->find($selector, 0);
}
||||||
/**
 * Camel-case convenience wrapper: runs find() with the selector "#<id>" and
 * the given index.
 *
 * @param string   $id  id value appended to '#'
 * @param int|null $idx index passed through unchanged (defaults to null)
 * @return mixed whatever find() returns
 */
public function getElementsById($id, $idx=null)
{
    $selector = '#' . $id;

    return $this->find($selector, $idx);
}
||||||
/**
 * Camel-case convenience wrapper: runs find() with $name as the selector and
 * index 0.
 *
 * @param string $name selector (tag name) passed through unchanged
 * @return mixed whatever find() returns
 */
public function getElementByTagName($name)
{
    $match = $this->find($name, 0);

    return $match;
}
||||||
/**
 * Camel-case convenience wrapper: runs find() with $name as the selector and
 * the given index.
 *
 * @param string $name selector (tag name) passed through unchanged
 * @param int    $idx  index passed through unchanged (defaults to -1)
 * @return mixed whatever find() returns
 */
public function getElementsByTagName($name, $idx=-1)
{
    $matches = $this->find($name, $idx);

    return $matches;
}
||||||
/**
 * Camel-case convenience wrapper around load_file().
 *
 * All arguments given to this method are collected into one array and that
 * array is passed to load_file() as a single argument. Nothing is returned.
 *
 * NOTE(review): load_file() is not visible from here. If it expects the
 * arguments individually rather than as one array, this call needs to
 * spread them - confirm against load_file()'s definition.
 *
 * @return void
 */
public function loadFile()
{
    $this->load_file(func_get_args());
}
||||||
1870 | } |
||||||
1871 |
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.