1 | <?php |
||
2 | |||
3 | /** |
||
4 | * The base class that defines the methods used to traverse an HTML DOM using |
||
5 | * either DOMDocument or simple_html_dom |
||
6 | * |
||
7 | * @package ElkArte Forum |
||
8 | * @copyright ElkArte Forum contributors |
||
9 | * @license BSD http://opensource.org/licenses/BSD-3-Clause (see accompanying LICENSE.txt file) |
||
10 | * |
||
11 | * @version 2.0 dev |
||
12 | * |
||
13 | */ |
||
14 | |||
15 | namespace ElkArte\Converters; |
||
16 | |||
17 | use ElkArte\Helper\Util; |
||
18 | |||
/**
 * Class AbstractDomParser
 *
 * Common helpers for walking an HTML DOM that is held either by PHP's built-in
 * DOMDocument (the "internal" parser) or by the bundled simple_html_dom library
 * (the "external" parser).  Each accessor branches on $internalParser so the
 * calling code does not need to know which of the two is in use.
 */
abstract class AbstractDomParser
{
	/** @var object The object that holds the dom */
	public $document;

	/** @var bool If we are using the internal or external parser */
	public $internalParser;

	/** @var string Line end character */
	public $line_end = "\n";

	/**
	 * @var string Line break character
	 *
	 * NOTE(review): a Markdown hard break is normally two trailing spaces; this
	 * literal carries one space as seen in the reviewed copy - confirm against the
	 * canonical file in case whitespace was collapsed.
	 */
	public $line_break = " \n\n";

	/** @var int Wordwrap output, set to 0 to skip wrapping */
	public $body_width = 76;

	/**
	 * For a given node, checks if it is anywhere nested inside a code block
	 *
	 * - Prevents converting anything that's inside a code block
	 * - Static, so the caller must say which parser produced the node
	 *
	 * @param object $node the node whose ancestors are inspected
	 * @param bool $internalParser true when $node is a DOMDocument node (properties),
	 * false when it is a simple_html_dom node (methods)
	 *
	 * @return bool true if any ancestor is a pre or code element
	 */
	public static function hasParentCode($node, $internalParser)
	{
		$parent = $internalParser ? $node->parentNode : $node->parentNode();
		while ($parent)
		{
			// Anywhere nested inside a code/pre block we don't render tags
			$name = $internalParser ? $parent->nodeName : $parent->nodeName();
			if (in_array($name, ['pre', 'code'], true))
			{
				return true;
			}

			// Back out another level, until we are done
			$parent = $internalParser ? $parent->parentNode : $parent->parentNode();
		}

		return false;
	}

	/**
	 * Chooses the DOM parser for the class
	 *
	 * - Uses DOMDocument when that class exists
	 * - Otherwise loads simple_html_dom.php from EXTDIR and flags the external parser
	 */
	public function setParser()
	{
		$this->internalParser = true;

		// PHP built-in function not available?
		if (!class_exists('\\DOMDocument'))
		{
			$this->internalParser = false;
			require_once(EXTDIR . '/simple_html_dom.php');
		}
	}

	/**
	 * Loads a string of HTML into the parser for processing
	 *
	 * - Internal parser: only the body text is kept and it is re-wrapped in a
	 * minimal UTF-8 html/head/body shell before loading, libxml errors are silenced
	 * for the duration of the load
	 * - External parser: the string is handed to str_get_html() unchanged
	 *
	 * @param string $html
	 */
	public function loadHTML($html)
	{
		if ($this->internalParser)
		{
			// Set up basic parameters for DomDocument, including silencing structural errors
			$current = libxml_use_internal_errors(true);

			// Just the body text, we will wrap it with our own html/head/body to ensure proper loading
			$html = $this->getBodyText($html);

			// Set up processing details
			$this->document = new \DOMDocument();
			$this->document->preserveWhiteSpace = false;
			$this->document->encoding = 'UTF-8';
			$this->document->loadHTML('<?xml encoding="UTF-8"><html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/></head><body>' . $html . '</body></html>');

			// Set the error handle back, clear any errors
			libxml_use_internal_errors($current);
			libxml_clear_errors();
		}
		// Or using the external simple html parser
		else
		{
			// NOTE(review): str_get_html() presumably returns false for empty or oversized
			// input, which would leave $document unusable - confirm callers guard for that
			$this->document = str_get_html($html, true, true, 'UTF-8', false);
		}
	}

	/**
	 * Returns just the body of a html document such that we are not dealing with head
	 * and any above head markup. multipart/mixed may have multiple sections that we concatenate
	 *
	 * - Tag names are matched case-insensitively, so <BODY> / <HTML> are recognised as well
	 * - Tries, in order: complete body sections, a complete html section, an unclosed body
	 * - When nothing matches the text is returned untouched
	 *
	 * @param string $text
	 *
	 * @return string
	 */
	public function getBodyText($text)
	{
		if (preg_match_all('~<body[^>]*?>(.*?)</body>~sui', $text, $bodies))
		{
			return implode("\n", $bodies[1]);
		}

		if (preg_match_all('~<html[^>]*?>(.*)</html>~sui', $text, $bodies))
		{
			return implode("\n", $bodies[1]);
		}

		// Parsers may have clipped the ending body or html tag off with the quote/signature
		if (preg_match('~<body[^>]*?>(.*)~sui', $text, $bodies))
		{
			return $bodies[1];
		}

		return $text;
	}

	/**
	 * Returns just the body of a dom object such that we are not dealing with head
	 * and any above head markup
	 *
	 * - Internal parser: comment nodes are removed first, then the first body element is returned
	 * - External parser: falls back from body, to html, to the document root
	 *
	 * @return object
	 */
	public function getDOMBodyNode()
	{
		// First remove any head node
		$this->_removeHeadNode();

		// The body of the HTML is where it's at.
		if ($this->internalParser)
		{
			// Remove comments
			$xpath = new \DOMXPath($this->document);
			foreach ($xpath->query('//comment()') as $comment)
			{
				$comment->parentNode->removeChild($comment);
			}

			return $xpath->query('//body')->item(0);
		}

		return $this->document->find('body', 0) ?? $this->document->find('html', 0) ?? $this->document->root;
	}

	/**
	 * Remove any <head node from the DOM
	 *
	 * This is done due to poor structure of some received HTML via email etc
	 */
	private function _removeHeadNode()
	{
		$head = $this->internalParser
			? $this->document->getElementsByTagName('head')->item(0)
			: $this->document->find('head', 0);

		if ($head === null)
		{
			return;
		}

		if ($this->internalParser)
		{
			$head->parentNode->removeChild($head);

			return;
		}

		// simple_html_dom drops a node from the output when its outertext is blanked
		$head->outertext = '';
	}

	/**
	 * Breaks a string up so its no more than width characters long
	 *
	 * - Will break at word boundaries
	 * - If no natural space is found will break mid-word
	 * - Lines that start with > keep that quote prefix on every wrapped piece
	 * - A width below 76 returns the string as is, no wrapping is done
	 *
	 * @param string $string
	 * @param int $width
	 * @param string $break
	 * @return string
	 */
	public function utf8Wordwrap($string, $width = 76, $break = "\n")
	{
		if ($width < 76)
		{
			return $string;
		}

		$lines = [];
		foreach (explode($break, $string) as $segment)
		{
			// Keep blank lines, compare against '' so a line holding just "0" is not lost
			if ($segment === '')
			{
				$lines[] = '';
				continue;
			}

			$in_quote = $segment[0] === '>';
			while ($segment !== '')
			{
				// Get the next #width characters before a break (space, punctuation tab etc)
				if (preg_match('~^(.{1,' . $width . '})(?:\s|$|,|\.)~u', $segment, $matches))
				{
					// Add the #width to the output and set up for the next pass
					$lines[] = ($in_quote && $matches[1][0] !== '>' ? '> ' : '') . $matches[1];
					$segment = (string) Util::substr($segment, Util::strlen($matches[1]));
				}
				// Humm just a long word with no place to break, so we simply cut it after width characters
				else
				{
					$lines[] = ($in_quote && $segment[0] !== '>' ? '> ' : '') . Util::substr($segment, 0, $width);
					$segment = (string) Util::substr($segment, $width);
				}
			}
		}

		// Join it all the shortened sections up on our break characters
		return implode($break, $lines);
	}

	/**
	 * Get the nesting level when inside a list
	 *
	 * @param object $node
	 *
	 * @return int the number of ul / ol ancestors of the node, 0 when not in a list
	 */
	public function hasParentList($node)
	{
		$depth = 0;

		$parent = $this->getParent($node);
		while ($parent)
		{
			// Anywhere nested inside a list we need to get the depth
			if (in_array($this->getName($parent), ['ul', 'ol'], true))
			{
				$depth++;
			}

			// Back out another level
			$parent = $this->getParent($parent);
		}

		return $depth;
	}

	/**
	 * Returns the parent node of another node
	 *
	 * @param object|null $node
	 * @return object|null null when $node itself is null
	 */
	public function getParent($node)
	{
		if ($node === null)
		{
			return null;
		}

		return $this->internalParser ? $node->parentNode : $node->parentNode();
	}

	/**
	 * Returns the node Name of a node
	 *
	 * @param object|null $node
	 * @return string empty string when $node is null
	 */
	public function getName($node)
	{
		if ($node === null)
		{
			return '';
		}

		return $this->internalParser ? $node->nodeName : $node->nodeName();
	}

	/**
	 * Returns the HTML of the document
	 *
	 * - Internal parser output is run through htmlspecialchars_decode and
	 * html_entity_decode, external parser output is returned as saved
	 *
	 * @return string
	 */
	public function getHTML()
	{
		if ($this->internalParser)
		{
			return html_entity_decode(htmlspecialchars_decode($this->document->saveHTML(), ENT_QUOTES), ENT_QUOTES, 'UTF-8');
		}

		return $this->document->save();
	}

	/**
	 * Gets a node object
	 *
	 * @param object|array $node a node list (internal) or array of nodes (external)
	 * @param int $item index of the wanted entry
	 * @return object
	 */
	public function getItem($node, $item)
	{
		return $this->internalParser ? $node->item($item) : $node[$item];
	}

	/**
	 * gets a node length
	 *
	 * @param object|array $node
	 * @return int
	 */
	public function getLength($node)
	{
		return $this->internalParser ? $node->length : count($node);
	}

	/**
	 * gets all children of a parent node
	 *
	 * @param object|array $node
	 * @return object
	 */
	public function getChildren($node)
	{
		return $this->internalParser ? $node->childNodes : $node->childNodes();
	}

	/**
	 * gets a specific child of a parent node
	 *
	 * @param object|array $node
	 * @param int $child child number to return
	 * @return object
	 */
	public function getChild($node, $child)
	{
		return $this->internalParser ? $node->childNodes->item($child) : $node->childNodes($child);
	}

	/**
	 * gets the next sibling of a node
	 *
	 * @param object|array $node
	 * @return object
	 */
	public function getSibling($node)
	{
		return $this->internalParser ? $node->nextSibling : $node->next_sibling();
	}

	/**
	 * gets a node value
	 *
	 * @param object|null $node
	 * @return string empty string when $node is null
	 */
	public function getValue($node)
	{
		if ($node === null)
		{
			return '';
		}

		if ($this->internalParser)
		{
			return $node->nodeValue;
		}

		return html_entity_decode(htmlspecialchars_decode($node->innertext, ENT_QUOTES), ENT_QUOTES, 'UTF-8');
	}

	/**
	 * Sets a node to a text value, replacing what was there
	 *
	 * @param object $node the node to replace
	 * @param string $text the text that takes its place
	 */
	public function setTextNode($node, $text)
	{
		if ($this->internalParser)
		{
			$text_node = $this->document->createTextNode($text);
			$node->parentNode->replaceChild($text_node, $node);
		}
		else
		{
			$node->outertext = $text;
		}
	}

	/**
	 * Gets the inner html of a node
	 *
	 * @param \DOMNode|object $node
	 * @return string
	 */
	public function getInnerHTML($node)
	{
		return $this->_exportInnerHTML($node);
	}

	/**
	 * Gets the outer html of a node
	 *
	 * @param \DOMNode|object $node
	 * @return string
	 */
	public function getOuterHTML($node)
	{
		return $this->internalParser ? htmlspecialchars_decode($this->document->saveHTML($node)) : $node->outertext;
	}

	/**
	 * Returns the inner html of a node
	 *
	 * - Despite its name this does not modify the node, it returns the same
	 * string as getInnerHTML()
	 *
	 * @param \DOMNode|object $node
	 * @return string
	 */
	public function setInnerHTML($node)
	{
		return $this->_exportInnerHTML($node);
	}

	/**
	 * Shared worker for getInnerHTML() / setInnerHTML()
	 *
	 * - Internal parser: the node is imported in to a scratch document, saved as
	 * html and its own opening / closing tag stripped from the ends
	 * - External parser: the innertext of the node is returned
	 *
	 * @param \DOMNode|object $node
	 * @return string
	 */
	private function _exportInnerHTML($node)
	{
		if (!$this->internalParser)
		{
			return $node->innertext;
		}

		$doc = new \DOMDocument();
		$doc->preserveWhiteSpace = true;
		$doc->appendChild($doc->importNode($node, true));
		$html = trim($doc->saveHTML());

		// Quote the name so it is always taken literally inside the pattern
		$tag = preg_quote($node->nodeName, '@');

		return preg_replace('@^<' . $tag . '[^>]*>|</' . $tag . '>$@', '', $html);
	}
}