Issues (1686)

sources/ElkArte/Converters/AbstractDomParser.php (6 issues)

Labels
1
<?php
2
3
/**
4
 * The base class that defines the methods used to traverse an HTML DOM using
5
 * either DOMDocument or simple_html_dom
6
 *
7
 * @package   ElkArte Forum
8
 * @copyright ElkArte Forum contributors
9
 * @license   BSD http://opensource.org/licenses/BSD-3-Clause (see accompanying LICENSE.txt file)
10
 *
11
 * @version 2.0 dev
12
 *
13
 */
14
15
namespace ElkArte\Converters;
16
17
use ElkArte\Helper\Util;
18
19
/**
 * Class AbstractDomParser
 *
 * Shared DOM helpers for the converters.  Each accessor branches on
 * $internalParser so calling code can work with either PHP's DOMDocument
 * (internal) or the simple_html_dom library (external) through one interface.
 */
abstract class AbstractDomParser
{
	/** @var object The object that holds the dom */
	public $document;

	/** @var bool If we are using the internal or external parser */
	public $internalParser;

	/** @var string Line end character */
	public $line_end = "\n";

	/** @var string Line break character */
	public $line_break = "  \n\n";

	/** @var int Wordwrap output, set to 0 to skip wrapping */
	public $body_width = 76;

	/**
	 * For a given node, checks if it is anywhere nested inside a code block
	 *
	 *  - Prevents converting anything that's inside a code block
	 *  - Walks up the ancestor chain until a pre/code node is found or there
	 *    are no more parents
	 *
	 * @param object $node the node whose ancestors are inspected
	 * @param bool $internalParser true when $node is a DOMDocument node (property access),
	 * false when it is a simple_html_dom node (method access)
	 *
	 * @return bool
	 */
	public static function hasParentCode($node, $internalParser)
	{
		$parent = $internalParser ? $node->parentNode : $node->parentNode();
		while ($parent)
		{
			// Anywhere nested inside a code/pre block we don't render tags
			$name = $internalParser ? $parent->nodeName : $parent->nodeName();
			if (in_array($name, ['pre', 'code'], true))
			{
				return true;
			}

			// Back out another level, until we are done
			$parent = $internalParser ? $parent->parentNode : $parent->parentNode();
		}

		return false;
	}

	/**
	 * Selects the DOM parser for the class
	 *
	 * - Uses DOMDocument when that class exists
	 * - Otherwise flags the external parser and includes simple_html_dom.php from EXTDIR
	 * - Does not load any HTML, use loadHTML() for that
	 */
	public function setParser()
	{
		$this->internalParser = true;

		// PHP built-in function not available?
		if (!class_exists('\\DOMDocument'))
		{
			$this->internalParser = false;
			require_once(EXTDIR . '/simple_html_dom.php');
		}
	}

	/**
	 * Loads a string of HTML into the parser for processing
	 *
	 * - Internal parser: reduces the input to its body text, wraps that in a
	 *   UTF-8 declared html/head/body shell and loads it into a new DOMDocument
	 * - External parser: hands the untouched string to str_get_html()
	 *
	 * @param string $html
	 */
	public function loadHTML($html)
	{
		if ($this->internalParser)
		{
			// Set up basic parameters for DomDocument, including silencing structural errors
			$current = libxml_use_internal_errors(true);

			// Just the body text, we will wrap it with our own html/head/body to ensure proper loading
			$html = $this->getBodyText($html);

			// Set up processing details
			$this->document = new \DOMDocument();
			$this->document->preserveWhiteSpace = false;
			$this->document->encoding = 'UTF-8';
			$this->document->loadHTML('<?xml encoding="UTF-8"><html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/></head><body>' . $html . '</body></html>');

			// Set the error handle back, clear any errors
			libxml_use_internal_errors($current);
			libxml_clear_errors();
		}
		// Or using the external simple html parser
		else
		{
			$this->document = str_get_html($html, true, true, 'UTF-8', false);
		}
	}

	/**
	 * Returns just the body of a html document such that we are not dealing with head
	 * and any above head markup.  multipart/mixed may have multiple sections that we concatenate
	 *
	 * Tried in order, first hit wins:
	 *  - every <body>...</body> section, joined with a newline
	 *  - every <html>...</html> section, joined with a newline
	 *  - everything after an opening <body> tag that has no closing tag
	 *  - the text as supplied
	 *
	 * @param string $text
	 *
	 * @return string
	 */
	public function getBodyText($text)
	{
		if (preg_match_all('~<body[^>]*?>(.*?)</body>~su', $text, $bodies))
		{
			return implode("\n", $bodies[1]);
		}

		if (preg_match_all('~<html[^>]*?>(.*)</html>~su', $text, $bodies))
		{
			return implode("\n", $bodies[1]);
		}

		// Parsers may have clipped the ending body or html tag off with the quote/signature
		if (preg_match('~<body[^>]*?>(.*)~su', $text, $bodies))
		{
			return $bodies[1];
		}

		return $text;
	}

	/**
	 * Returns just the body of a dom object such that we are not dealing with head
	 * and any above head markup
	 *
	 * - Removes the head node first
	 * - Internal parser: also removes all comment nodes, then returns the first body node
	 * - External parser: returns the first body node, else the first html node, else the root
	 *
	 * @return object
	 */
	public function getDOMBodyNode()
	{
		// First remove any head node
		$this->_removeHeadNode();

		// The body of the HTML is where it's at.
		if ($this->internalParser)
		{
			// Remove comments
			$xpath = new \DOMXPath($this->document);
			foreach ($xpath->query('//comment()') as $comment)
			{
				$comment->parentNode->removeChild($comment);
			}

			return $xpath->query('//body')->item(0);
		}

		return $this->document->find('body', 0) ?? $this->document->find('html', 0) ?? $this->document->root;
	}

	/**
	 * Remove any <head node from the DOM
	 *
	 * This is done due to poor structure of some received HTML via email etc
	 */
	private function _removeHeadNode()
	{
		$head = $this->internalParser
			? $this->document->getElementsByTagName('head')->item(0)
			: $this->document->find('head', 0);

		if ($head === null)
		{
			return;
		}

		if ($this->internalParser)
		{
			$head->parentNode->removeChild($head);

			return;
		}

		// External parser, blank the node we already located rather than searching again
		$head->outertext = '';
	}

	/**
	 * Breaks a string up so its no more than width characters long
	 *
	 * - Will break at word boundaries
	 * - If no natural space is found will break mid-word
	 * - Lines that start with > are treated as quotes, wrapped continuation
	 *   lines of a quote are prefixed with "> "
	 * - A width below 76 returns the string untouched, which also covers the
	 *   "0 means no wrapping" setting
	 *
	 * @param string $string
	 * @param int $width
	 * @param string $break
	 * @return string
	 */
	public function utf8Wordwrap($string, $width = 76, $break = "\n")
	{
		// The width is placed in a regex quantifier, so it must be a whole number
		$width = (int) $width;
		if ($width < 76)
		{
			return $string;
		}

		$lines = [];

		foreach (explode($break, $string) as $segment)
		{
			$in_quote = isset($segment[0]) && $segment[0] === '>';

			// Keep blank lines.  Compared against '' since empty('0') is true and
			// would drop a line that holds nothing but a zero.
			if ($segment === '')
			{
				$lines[] = '';
				continue;
			}

			while ($segment !== '')
			{
				// Get the next #width characters before a break (space, punctuation tab etc)
				if (preg_match('~^(.{1,' . $width . '})(?:\s|$|,|\.)~u', $segment, $matches))
				{
					// Add the #width to the output and set up for the next pass
					$lines[] = ($in_quote && $matches[1][0] !== '>' ? '> ' : '') . $matches[1];

					// The match is anchored at the start, so it is a byte prefix of the
					// segment and can be removed exactly with the byte functions.
					$segment = (string) substr($segment, strlen($matches[1]));
				}
				// Humm just a long word with no place to break, so we simply cut it after width characters
				else
				{
					$lines[] = ($in_quote && $segment[0] !== '>' ? '> ' : '') . Util::substr($segment, 0, $width);

					// Cast so a false/null "nothing left" result still ends the loop
					$segment = (string) Util::substr($segment, $width);
				}
			}
		}

		// Join it all the shortened sections up on our break characters
		return implode($break, $lines);
	}

	/**
	 * Get the nesting level when inside a list
	 *
	 * Counts every ul / ol ancestor of the node, 0 means it is not in a list
	 *
	 * @param object $node
	 *
	 * @return int
	 */
	public function hasParentList($node)
	{
		$depth = 0;

		$parent = $this->getParent($node);
		while ($parent)
		{
			// Anywhere nested inside a list we need to get the depth
			if (in_array($this->getName($parent), ['ul', 'ol'], true))
			{
				$depth++;
			}

			// Back out another level
			$parent = $this->getParent($parent);
		}

		return $depth;
	}

	/**
	 * Returns the parent node of another node
	 *
	 * @param object|null $node
	 * @return object|null null when $node is null, otherwise whatever the parser reports as parent
	 */
	public function getParent($node)
	{
		if ($node === null)
		{
			return null;
		}

		return $this->internalParser ? $node->parentNode : $node->parentNode();
	}

	/**
	 * Returns the node Name of a node
	 *
	 * @param object|null $node
	 * @return string empty string when $node is null
	 */
	public function getName($node)
	{
		if ($node === null)
		{
			return '';
		}

		return $this->internalParser ? $node->nodeName : $node->nodeName();
	}

	/**
	 * Returns the HTML of the document
	 *
	 * - Internal parser: the saved HTML with special chars and entities decoded (ENT_QUOTES, UTF-8)
	 * - External parser: the result of the document save() call
	 *
	 * @return string
	 */
	public function getHTML()
	{
		if ($this->internalParser)
		{
			return html_entity_decode(htmlspecialchars_decode($this->document->saveHTML(), ENT_QUOTES), ENT_QUOTES, 'UTF-8');
		}

		return $this->document->save();
	}

	/**
	 * Gets a node object
	 *
	 * @param object|array $node a node list (internal) or array of nodes (external)
	 * @param int $item index of the wanted entry
	 * @return object
	 */
	public function getItem($node, $item)
	{
		return $this->internalParser ? $node->item($item) : $node[$item];
	}

	/**
	 * gets a node length
	 *
	 * @param object|array $node a node list (internal) or array of nodes (external)
	 * @return int
	 */
	public function getLength($node)
	{
		return $this->internalParser ? $node->length : count($node);
	}

	/**
	 * gets all children of a parent node
	 *
	 * @param object|array $node
	 * @return object
	 */
	public function getChildren($node)
	{
		return $this->internalParser ? $node->childNodes : $node->childNodes();
	}

	/**
	 * gets a specific child of a parent node
	 *
	 * @param object|array $node
	 * @param int $child child number to return
	 * @return object
	 */
	public function getChild($node, $child)
	{
		return $this->internalParser ? $node->childNodes->item($child) : $node->childNodes($child);
	}

	/**
	 * gets the next sibling of a node
	 *
	 * @param object|array $node
	 * @return object
	 */
	public function getSibling($node)
	{
		return $this->internalParser ? $node->nextSibling : $node->next_sibling();
	}

	/**
	 * gets a node value
	 *
	 * - Internal parser: the nodeValue
	 * - External parser: the innertext with special chars and entities decoded
	 *
	 * @param object|null $node
	 * @return string empty string when $node is null
	 */
	public function getValue($node)
	{
		if ($node === null)
		{
			return '';
		}

		if ($this->internalParser)
		{
			return $node->nodeValue;
		}

		return html_entity_decode(htmlspecialchars_decode($node->innertext, ENT_QUOTES), ENT_QUOTES, 'UTF-8');
	}

	/**
	 * Sets a node to a text value, replacing what was there
	 *
	 * The node itself is replaced, not just its content
	 *
	 * @param object $node
	 * @param string $text
	 */
	public function setTextNode($node, $text)
	{
		if ($this->internalParser)
		{
			$text_node = $this->document->createTextNode($text);
			$node->parentNode->replaceChild($text_node, $node);
		}
		else
		{
			$node->outertext = $text;
		}
	}

	/**
	 * Gets the inner html of a node
	 *
	 * @param \DOMNode|object $node
	 * @return string
	 */
	public function getInnerHTML($node)
	{
		if ($this->internalParser)
		{
			return $this->_getNodeInnerHTML($node);
		}

		return $node->innertext;
	}

	/**
	 * Gets the outer html of a node
	 *
	 * @param \DOMNode|object $node
	 * @return string
	 */
	public function getOuterHTML($node)
	{
		return $this->internalParser ? htmlspecialchars_decode($this->document->saveHTML($node)) : $node->outertext;
	}

	/**
	 * Gets the inner html of a node
	 *
	 * Despite the name nothing is set or modified here, the method only reads
	 * and returns the same thing getInnerHTML() does.
	 *
	 * @param \DOMNode|object $node
	 * @return string
	 */
	public function setInnerHTML($node)
	{
		if ($this->internalParser)
		{
			return $this->_getNodeInnerHTML($node);
		}

		return $node->innertext;
	}

	/**
	 * Serializes a DOMDocument node and strips its own opening and closing tag
	 *
	 * @param \DOMNode $node
	 * @return string
	 */
	private function _getNodeInnerHTML($node)
	{
		// Copy the node into a scratch document so only that node is serialized
		$doc = new \DOMDocument();
		$doc->preserveWhiteSpace = true;
		$doc->appendChild($doc->importNode($node, true));
		$html = trim($doc->saveHTML());

		// Quote the name so it is always matched literally inside the pattern
		$tag = preg_quote($node->nodeName, '@');

		return preg_replace('@^<' . $tag . '[^>]*>|</' . $tag . '>$@', '', $html);
	}
}