Completed
Push — master ( de33f2...651825 )
by Lars
01:40
created

putReplacedBackToPreserveHtmlEntities()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 17
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 10
CRAP Score 2

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 17
ccs 10
cts 10
cp 1
rs 9.4285
cc 2
eloc 10
nc 2
nop 1
crap 2
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
use BadMethodCallException;
8
use DOMDocument;
9
use DOMXPath;
10
use InvalidArgumentException;
11
use RuntimeException;
12
13
/**
 * Class HtmlDomParser
 *
 * Thin wrapper around \DOMDocument that loads HTML (from a string, a file,
 * a SimpleHtmlDom or a \DOMNode), lets callers query it with CSS selectors
 * and serialises it back to HTML / XML / plain text.
 *
 * @package voku\helper
 *
 * @property-read string outerText <p>Get dom node's outer html (alias for "outerHtml").</p>
 * @property-read string outerHtml <p>Get dom node's outer html.</p>
 * @property-read string innerText <p>Get dom node's inner html (alias for "innerHtml").</p>
 * @property-read string innerHtml <p>Get dom node's inner html.</p>
 * @property-read string plaintext <p>Get dom node's plain text.</p>
 *
 * @method string outerText() <p>Get dom node's outer html (alias for "outerHtml()").</p>
 * @method string outerHtml() <p>Get dom node's outer html.</p>
 * @method string innerText() <p>Get dom node's inner html (alias for "innerHtml()").</p>
 *
 * @method HtmlDomParser load($html) <p>Load HTML from string.</p>
 * @method HtmlDomParser load_file($html) <p>Load HTML from file.</p>
 *
 * @method static HtmlDomParser file_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from file.</p>
 * @method static HtmlDomParser str_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from string.</p>
 */
class HtmlDomParser
{
  /**
   * Lower-cased magic method names (see __call()) mapped to the real method they forward to.
   *
   * @var string[]
   */
  protected static $functionAliases = [
      'outertext' => 'html',
      'outerhtml' => 'html',
      'innertext' => 'innerHtml',
      'innerhtml' => 'innerHtml',
      'load'      => 'loadHtml',
      'load_file' => 'loadHtmlFile',
  ];

  /**
   * Characters that are masked inside detected links ('orig') and the
   * placeholder each one is replaced with ('tmp'); both lists are index-aligned.
   *
   * @var string[][]
   */
  protected static $domLinkReplaceHelper = [
      'orig' => ['[', ']', '{', '}',],
      'tmp'  => [
          '!!!!SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT!!!!',
          '!!!!SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT!!!!',
          '!!!!SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT!!!!',
          '!!!!SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT!!!!',
      ],
  ];

  /**
   * Characters that are masked in the whole input ('orig') and the
   * placeholder each one is replaced with ('tmp'); both lists are index-aligned.
   *
   * @var string[][]
   */
  protected static $domReplaceHelper = [
      'orig' => ['&', '|', '+', '%'],
      'tmp'  => [
          '!!!!SIMPLE_HTML_DOM__VOKU__AMP!!!!',
          '!!!!SIMPLE_HTML_DOM__VOKU__PIPE!!!!',
          '!!!!SIMPLE_HTML_DOM__VOKU__PLUS!!!!',
          '!!!!SIMPLE_HTML_DOM__VOKU__PERCENT!!!!',
      ],
  ];

  /**
   * Optional callback invoked by html() before serialising; set via set_callback().
   *
   * @var callable|null
   */
  protected static $callback;

  /**
   * @var \DOMDocument
   */
  protected $document;

  /**
   * @var string
   */
  protected $encoding = 'UTF-8';

  /**
   * True when the last loaded input contained no "<" at all.
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtml = false;

  /**
   * True when the last loaded input contained markup but did not start with "<".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutWrapper = false;

  /**
   * True when the last loaded input contained no "<head>".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHeadWrapper = false;

  /**
   * True when the last loaded input contained no "<html".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;

  /**
   * Constructor
   *
   * @param string|SimpleHtmlDom|\DOMNode|null $element HTML code or SimpleHtmlDom, \DOMNode
   */
  public function __construct($element = null)
  {
    $this->document = new \DOMDocument('1.0', $this->getEncoding());

    // DOMDocument settings
    $this->document->preserveWhiteSpace = true;
    $this->document->formatOutput = true;

    // unwrap our own node wrapper so that only string / \DOMNode remain below
    if ($element instanceof SimpleHtmlDom) {
      $element = $element->getNode();
    }

    if ($element instanceof \DOMNode) {
      // deep-copy the foreign node into our own document
      $domNode = $this->document->importNode($element, true);

      if ($domNode instanceof \DOMNode) {
        $this->document->appendChild($domNode);
      }

      return;
    }

    if ($element !== null) {
      $this->loadHtml($element);
    }
  }

  /**
   * Forward the alias methods listed in self::$functionAliases (matched case-insensitively).
   *
   * @param string $name
   * @param array  $arguments
   *
   * @return mixed
   *
   * @throws \BadMethodCallException if the name is not a known alias
   */
  public function __call($name, $arguments)
  {
    $name = \strtolower($name);

    if (isset(self::$functionAliases[$name])) {
      return \call_user_func_array([$this, self::$functionAliases[$name]], $arguments);
    }

    throw new \BadMethodCallException('Method does not exist: ' . $name);
  }

  /**
   * Static entry points "str_get_html" and "file_get_html".
   *
   * @param string $name
   * @param array  $arguments [0] => html or file path, [1] => extra libxml options
   *
   * @return HtmlDomParser
   *
   * @throws \BadMethodCallException if the name is neither of the two entry points
   * @throws \RuntimeException
   */
  public static function __callStatic($name, $arguments)
  {
    $arguments0 = $arguments[0] ?? '';
    $arguments1 = $arguments[1] ?? null;

    if ($name === 'str_get_html') {
      $parser = new self();

      return $parser->loadHtml($arguments0, $arguments1);
    }

    if ($name === 'file_get_html') {
      $parser = new self();

      return $parser->loadHtmlFile($arguments0, $arguments1);
    }

    throw new \BadMethodCallException('Method does not exist');
  }

  /** @noinspection MagicMethodsValidityInspection */
  /**
   * Read-only virtual properties (matched case-insensitively).
   *
   * @param string $name
   *
   * @return string|null null for an unknown property name
   */
  public function __get($name)
  {
    $name = \strtolower($name);

    switch ($name) {
      case 'outerhtml':
      case 'outertext':
        return $this->html();
      case 'innerhtml':
      case 'innertext':
        return $this->innerHtml();
      case 'text':
      case 'plaintext':
        return $this->text();
    }

    return null;
  }

  /**
   * Shortcut for find().
   *
   * @param string   $selector
   * @param int|null $idx
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function __invoke($selector, $idx = null)
  {
    return $this->find($selector, $idx);
  }

  /**
   * @return string
   */
  public function __toString()
  {
    return $this->html();
  }

  /**
   * does nothing (only for api-compatibility-reasons)
   *
   * @deprecated
   *
   * @return bool
   */
  public function clear(): bool
  {
    return true;
  }

  /**
   * Mask "&", "|", "+" and "%" (and, inside detected http(s) links, also
   * "[", "]", "{" and "}") with placeholders.
   *
   * @param string $html
   *
   * @return string
   */
  public static function replaceToPreserveHtmlEntities(string $html): string
  {
    // init
    $linksNew = [];
    $linksOld = [];

    if (\strpos($html, 'http') !== false) {

      // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
      $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
      \preg_match_all($regExUrl, $html, $linksOld);

      if (!empty($linksOld[1])) {
        $linksOld = $linksOld[1];
        foreach ((array)$linksOld as $linkKey => $linkOld) {
          $linksNew[$linkKey] = \str_replace(
              self::$domLinkReplaceHelper['orig'],
              self::$domLinkReplaceHelper['tmp'],
              $linkOld
          );
        }
      }
    }

    // only use the link replacements when every matched link got a masked counterpart
    $linksNewCount = \count($linksNew);
    if ($linksNewCount > 0 && \count($linksOld) === $linksNewCount) {
      $search = \array_merge($linksOld, self::$domReplaceHelper['orig']);
      $replace = \array_merge($linksNew, self::$domReplaceHelper['tmp']);
    } else {
      $search = self::$domReplaceHelper['orig'];
      $replace = self::$domReplaceHelper['tmp'];
    }

    return \str_replace($search, $replace, $html);
  }

  /**
   * Reverse replaceToPreserveHtmlEntities(): turn every placeholder back into its original character.
   *
   * @param string $html
   *
   * @return string
   */
  public static function putReplacedBackToPreserveHtmlEntities(string $html): string
  {
    // merged lookup tables, built once per request
    static $DOM_REPLACE__HELPER_CACHE = null;

    if ($DOM_REPLACE__HELPER_CACHE === null) {
      $DOM_REPLACE__HELPER_CACHE = [
          'tmp'  => \array_merge(
              self::$domLinkReplaceHelper['tmp'],
              self::$domReplaceHelper['tmp']
          ),
          'orig' => \array_merge(
              self::$domLinkReplaceHelper['orig'],
              self::$domReplaceHelper['orig']
          ),
      ];
    }

    return \str_replace($DOM_REPLACE__HELPER_CACHE['tmp'], $DOM_REPLACE__HELPER_CACHE['orig'], $html);
  }

  /**
   * Create DOMDocument from HTML.
   *
   * The input is first parsed as XML via SimpleXML; if that fails or reports
   * any libxml error, it is parsed with DOMDocument::loadHTML() instead.
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions
   *
   * @return \DOMDocument
   */
  private function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
  {
    // Recompute the flags on every load so that re-using an instance for a
    // second document does not keep the flags of the previous one.
    $this->isDOMDocumentCreatedWithoutHtml = false;
    $this->isDOMDocumentCreatedWithoutWrapper = false;

    if (\strpos($html, '<') === false) {
      $this->isDOMDocumentCreatedWithoutHtml = true;
    } elseif (\strpos(\ltrim($html), '<') !== 0) {
      $this->isDOMDocumentCreatedWithoutWrapper = true;
    }

    $this->isDOMDocumentCreatedWithoutHtmlWrapper = (\strpos($html, '<html') === false);
    $this->isDOMDocumentCreatedWithoutHeadWrapper = (\strpos($html, '<head>') === false);

    // set error level
    // NOTE: libxml_disable_entity_loader() is deprecated as of PHP 8.0, but it is kept
    //       because LIBXML_DTDLOAD is passed below.
    $internalErrors = \libxml_use_internal_errors(true);
    $disableEntityLoader = \libxml_disable_entity_loader(true);
    \libxml_clear_errors();

    try {
      $optionsSimpleXml = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;
      $optionsXml = 0;

      // these constants depend on the PHP / libxml version
      foreach (['LIBXML_BIGLINES', 'LIBXML_COMPACT', 'LIBXML_HTML_NOIMPLIED', 'LIBXML_HTML_NODEFDTD'] as $optionalFlag) {
        if (\defined($optionalFlag)) {
          $optionsSimpleXml |= \constant($optionalFlag);
        }
      }

      if ($libXMLExtraOptions !== null) {
        $optionsSimpleXml |= $libXMLExtraOptions;
        $optionsXml |= $libXMLExtraOptions;
      }

      $sxe = \simplexml_load_string($html, 'SimpleXMLElement', $optionsSimpleXml);
      if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
        $this->document = \dom_import_simplexml($sxe)->ownerDocument;
      } else {
        $this->loadHtmlWithEncodingHack($html, $optionsXml);

        \libxml_clear_errors();
      }

      // set encoding
      $this->document->encoding = $this->getEncoding();
    } finally {
      // restore lib-xml settings, even if parsing threw
      \libxml_use_internal_errors($internalErrors);
      \libxml_disable_entity_loader($disableEntityLoader);
    }

    return $this->document;
  }

  /**
   * Parse $html with DOMDocument::loadHTML() into $this->document, forcing the configured encoding.
   *
   * @param string $html
   * @param int    $optionsXml libxml options passed to loadHTML()
   *
   * @return void
   */
  private function loadHtmlWithEncodingHack(string $html, $optionsXml)
  {
    // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
    $html = \trim($html);
    $xmlHackUsed = false;
    // prepend an xml declaration unless the input already starts with one
    // (haystack first, needle second)
    if (\stripos($html, '<?xml') !== 0) {
      $xmlHackUsed = true;
      $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
    }

    $html = self::replaceToPreserveHtmlEntities($html);

    $this->document->loadHTML($html, $optionsXml);

    // remove the "xml-encoding" hack
    if ($xmlHackUsed === true) {
      foreach ($this->document->childNodes as $child) {
        if ($child->nodeType === XML_PI_NODE) {
          $this->document->removeChild($child);

          // childNodes is a live list: stop right after mutating it
          break;
        }
      }
    }
  }

  /**
   * Wrap raw DOM nodes into SimpleHtmlDom objects and optionally pick one by index.
   *
   * @param \DOMNodeList|false $nodesList
   * @param int|null           $idx null => all elements, negative => counted from the end
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  private function wrapNodes($nodesList, $idx)
  {
    $elements = new SimpleHtmlDomNode();

    // DOMXPath::query() returns false for a malformed expression
    if ($nodesList !== false) {
      foreach ($nodesList as $node) {
        $elements[] = new SimpleHtmlDom($node);
      }
    }

    // return all elements
    if (null === $idx) {
      return $elements;
    }

    // handle negative values
    if ($idx < 0) {
      $idx = \count($elements) + $idx;
    }

    // return one element
    if (isset($elements[$idx])) {
      return $elements[$idx];
    }

    // return a blank-element
    return new SimpleHtmlDomNodeBlank();
  }

  /**
   * Return element by #id.
   *
   * @param string $id
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementById(string $id)
  {
    return $this->find("#$id", 0);
  }

  /**
   * Return element by tag name.
   *
   * @param string $name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementByTagName(string $name)
  {
    $node = $this->document->getElementsByTagName($name)->item(0);

    if ($node === null) {
      return new SimpleHtmlDomNodeBlank();
    }

    return new SimpleHtmlDom($node);
  }

  /**
   * Returns elements by #id.
   *
   * @param string   $id
   * @param null|int $idx
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function getElementsById(string $id, $idx = null)
  {
    return $this->find("#$id", $idx);
  }

  /**
   * Returns elements by tag name.
   *
   * @param string   $name
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementsByTagName(string $name, $idx = null)
  {
    return $this->wrapNodes($this->document->getElementsByTagName($name), $idx);
  }

  /**
   * Find one node with a CSS selector.
   *
   * @param string $selector
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function findOne(string $selector)
  {
    return $this->find($selector, 0);
  }

  /**
   * Find list of nodes with a CSS selector.
   *
   * @param string   $selector
   * @param int|null $idx null => all matches, negative => counted from the end
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function find(string $selector, $idx = null)
  {
    $xPathQuery = SelectorConverter::toXPath($selector);

    $xPath = new \DOMXPath($this->document);

    return $this->wrapNodes($xPath->query($xPathQuery), $idx);
  }

  /**
   * Clean up serialised output: strip the wrappers that were not part of the
   * loaded input, decode entities / url-encoding and restore the masked characters.
   *
   * @param string $content
   * @param bool   $multiDecodeNewHtmlEntity decode repeatedly until the string no longer changes
   *
   * @return string
   */
  public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
  {
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      $content = \str_replace(
          [
              "\n",
              "\r\n",
              "\r",
              '<body>',
              '</body>',
              '<html>',
              '</html>',
          ],
          '',
          $content
      );
    }

    if ($this->isDOMDocumentCreatedWithoutWrapper === true) {
      $content = (string)\preg_replace('/^<p>/', '', $content);
      $content = (string)\preg_replace('/<\/p>/', '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $content = \str_replace(
          [
              '<p>',
              '</p>',
              '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
          ],
          '',
          $content
      );
    }

    $content = \str_replace(
        [
            '<simpleHtmlDomP>',
            '</simpleHtmlDomP>',
            '<head><head>',
            '</head></head>',
            '<br></br>',
        ],
        [
            '',
            '',
            '<head>',
            '</head>',
            '<br>',
        ],
        $content
    );

    $content = \trim($content);
    if ($multiDecodeNewHtmlEntity === true) {
      if (\class_exists('\voku\helper\UTF8')) {

        /** @noinspection PhpUndefinedClassInspection */
        $content = \voku\helper\UTF8::rawurldecode($content);

      } else {

        // decode until a pass no longer changes anything
        do {
          $content_compare = $content;

          $content = \rawurldecode(
              \html_entity_decode(
                  $content,
                  ENT_QUOTES | ENT_HTML5
              )
          );

        } while ($content_compare !== $content);

      }

    } else {

      $content = \rawurldecode(
          \html_entity_decode(
              $content,
              ENT_QUOTES | ENT_HTML5
          )
      );
    }

    return self::putReplacedBackToPreserveHtmlEntities($content);
  }

  /**
   * @return \DOMDocument
   */
  public function getDocument(): \DOMDocument
  {
    return $this->document;
  }

  /**
   * Get the encoding to use.
   *
   * @return string
   */
  private function getEncoding(): string
  {
    return $this->encoding;
  }

  /**
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml(): bool
  {
    return $this->isDOMDocumentCreatedWithoutHtml;
  }

  /**
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
  {
    return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
  }

  /**
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
  {
    return $this->isDOMDocumentCreatedWithoutHeadWrapper;
  }

  /**
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutWrapper(): bool
  {
    return $this->isDOMDocumentCreatedWithoutWrapper;
  }

  /**
   * Get dom node's outer html.
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function html(bool $multiDecodeNewHtmlEntity = false): string
  {
    if ($this::$callback !== null) {
      \call_user_func($this::$callback, [$this]);
    }

    if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
      // serialise from the root element only
      $content = $this->document->saveHTML($this->document->documentElement);
    } else {
      $content = $this->document->saveHTML();
    }

    return $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
  }

  /**
   * Get the HTML as XML.
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function xml(bool $multiDecodeNewHtmlEntity = false): string
  {
    $xml = $this->document->saveXML(null, LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $xml = \ltrim((string)\preg_replace('/<\?xml.*\?>/', '', $xml));

    return $this->fixHtmlOutput($xml, $multiDecodeNewHtmlEntity);
  }

  /**
   * Get dom node's inner html.
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
  {
    $text = '';

    // an empty document (nothing loaded yet) has no root element
    $root = $this->document->documentElement;
    if ($root !== null) {
      foreach ($root->childNodes as $node) {
        $text .= $this->document->saveHTML($node);
      }
    }

    return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
  }

  /**
   * Load HTML from string.
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions
   *
   * @return HtmlDomParser
   */
  public function loadHtml(string $html, $libXMLExtraOptions = null): self
  {
    $this->document = $this->createDOMDocument($html, $libXMLExtraOptions);

    return $this;
  }

  /**
   * Load HTML from file.
   *
   * @param string   $filePath local path or http(s) url
   * @param int|null $libXMLExtraOptions
   *
   * @return HtmlDomParser
   *
   * @throws \RuntimeException if the file does not exist or could not be read
   */
  public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): self
  {
    // http(s) urls are not checked with file_exists()
    if (
        !\preg_match("/^https?:\/\//i", $filePath)
        &&
        !\file_exists($filePath)
    ) {
      throw new \RuntimeException("File $filePath not found");
    }

    try {
      if (\class_exists('\voku\helper\UTF8')) {
        /** @noinspection PhpUndefinedClassInspection */
        $html = \voku\helper\UTF8::file_get_contents($filePath);
      } else {
        $html = \file_get_contents($filePath);
      }
    } catch (\Exception $e) {
      // keep the original failure reachable via getPrevious()
      throw new \RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new \RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html, $libXMLExtraOptions);

    return $this;
  }

  /**
   * Save the html-dom as string.
   *
   * @param string $filepath if not empty, the inner html is also written to this file
   *
   * @return string
   */
  public function save(string $filepath = ''): string
  {
    $string = $this->innerHtml();
    if ($filepath !== '') {
      \file_put_contents($filepath, $string, LOCK_EX);
    }

    return $string;
  }

  /**
   * Register the callback that html() invokes before serialising.
   *
   * @param callable|null $functionName
   */
  public function set_callback($functionName)
  {
    $this::$callback = $functionName;
  }

  /**
   * Get dom node's plain text.
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function text(bool $multiDecodeNewHtmlEntity = false): string
  {
    return $this->fixHtmlOutput($this->document->textContent, $multiDecodeNewHtmlEntity);
  }

  /**
   * Give the clone its own copy of the underlying document.
   */
  public function __clone()
  {
    $this->document = clone $this->document;
  }
}
853