Completed
Push — master ( 478bc4...8d9f33 )
by Lars
02:20
created

HtmlDomParser::set_callback()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
dl 0
loc 3
ccs 0
cts 3
cp 0
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 1
crap 2
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * Class HtmlDomParser
9
 *
10
 * @package voku\helper
11
 *
12
 * @property-read string outerText <p>Get dom node's outer html (alias for "outerHtml").</p>
13
 * @property-read string outerHtml <p>Get dom node's outer html.</p>
14
 * @property-read string innerText <p>Get dom node's inner html (alias for "innerHtml").</p>
15
 * @property-read string innerHtml <p>Get dom node's inner html.</p>
16
 * @property-read string plaintext <p>Get dom node's plain text.</p>
17
 *
18
 * @method string outerText() <p>Get dom node's outer html (alias for "outerHtml()").</p>
19
 * @method string outerHtml() <p>Get dom node's outer html.</p>
20
 * @method string innerText() <p>Get dom node's inner html (alias for "innerHtml()").</p>
21
 *
22
 * @method HtmlDomParser load() load($html) <p>Load HTML from string.</p>
23
 * @method HtmlDomParser load_file() load_file($html) <p>Load HTML from file.</p>
24
 *
25
 * @method static HtmlDomParser file_get_html() file_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from
26
 *         file.</p>
27
 * @method static HtmlDomParser str_get_html() str_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from
28
 *         string.</p>
29
 */
30
class HtmlDomParser {
31
  /**
   * Lower-cased legacy method names mapped to the methods that implement them (used by __call()).
   *
   * @var array<string, string>
   */
  protected static $functionAliases = [
    'outertext' => 'html',
    'outerhtml' => 'html',
    'innertext' => 'innerHtml',
    'innerhtml' => 'innerHtml',
    'load'      => 'loadHtml',
    'load_file' => 'loadHtmlFile',
  ];

  /**
   * Characters that are swapped for placeholders inside matched links ("orig" => "tmp", same index).
   *
   * @var string[][]
   */
  protected static $domLinkReplaceHelper = [
    'orig' => ['[', ']', '{', '}',],
    'tmp'  => [
      '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
      '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
      '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
      '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
    ],
  ];

  /**
   * Characters that are swapped for placeholders in the whole markup ("orig" => "tmp", same index).
   *
   * @var string[][]
   */
  protected static $domReplaceHelper = [
    'orig' => ['&', '|', '+', '%', '@'],
    'tmp'  => [
      '____SIMPLE_HTML_DOM__VOKU__AMP____',
      '____SIMPLE_HTML_DOM__VOKU__PIPE____',
      '____SIMPLE_HTML_DOM__VOKU__PLUS____',
      '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
      '____SIMPLE_HTML_DOM__VOKU__AT____',
    ],
  ];

  /**
   * Broken markup fragments ("orig") and the tokens standing in for them ("tmp");
   * filled by keepBrokenHtml() and reset by the constructor.
   *
   * @var string[][]
   */
  protected static $domBrokenReplaceHelper = [];

  /**
   * Callback invoked by html() before serialising; null until set_callback() is used.
   *
   * @var callable|null
   */
  protected static $callback;

  /**
   * @var \DOMDocument
   */
  protected $document;

  /**
   * @var string
   */
  protected $encoding = 'UTF-8';

  /**
   * True when the loaded input contained no "<" at all.
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtml = false;

  /**
   * True when the loaded input contained markup but did not start with "<".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutWrapper = false;

  /**
   * True when the loaded input contained no "<head>".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHeadWrapper = false;

  /**
   * True when the loaded input contained no "<html".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;

  /**
   * Whether broken markup is preserved while parsing (see useKeepBrokenHtml()).
   * Explicitly initialised so the property always holds the documented type.
   *
   * @var bool
   */
  protected $keepBrokenHtml = false;
114
115
  /**
   * Create a parser, optionally pre-filled from markup or from an existing node.
   *
   * @param string|SimpleHtmlDom|\DOMNode|null $element HTML code, a SimpleHtmlDom wrapper or a \DOMNode;
   *                                                    null leaves the document empty
   *
   * @throws \InvalidArgumentException
   */
  public function __construct($element = null) {
    $this->document = new \DOMDocument('1.0', $this->getEncoding());

    // reset the class-wide table of preserved broken-html fragments
    self::$domBrokenReplaceHelper = [];

    // DOMDocument settings
    $this->document->preserveWhiteSpace = true;
    $this->document->formatOutput = true;

    // unwrap the SimpleHtmlDom wrapper so only \DOMNode / markup / null remain
    $source = $element instanceof SimpleHtmlDom ? $element->getNode() : $element;

    if ($source instanceof \DOMNode) {
      // deep-import the foreign node into our own document before attaching it
      $imported = $this->document->importNode($source, true);

      if ($imported instanceof \DOMNode) {
        $this->document->appendChild($imported);
      }
    } elseif ($source !== null) {
      $this->loadHtml($source);
    }
  }
150
151
  /**
   * Forward calls to legacy method names (case-insensitive) to their current implementation.
   *
   * @param string $name      called method name
   * @param array  $arguments arguments of the call
   *
   * @return mixed whatever the aliased method returns
   *
   * @throws \BadMethodCallException if the name is not a known alias
   */
  public function __call($name, $arguments) {
    $alias = \strtolower($name);

    if (!isset(self::$functionAliases[$alias])) {
      throw new \BadMethodCallException('Method does not exist: ' . $alias);
    }

    $target = [$this, self::$functionAliases[$alias]];

    return \call_user_func_array($target, $arguments);
  }
166
167
  /**
   * Static factory aliases: "str_get_html" parses a string, "file_get_html" parses a file.
   *
   * @param string $name      called static method name
   * @param array  $arguments [0] => markup or file path (default ''), [1] => extra libxml options (default null)
   *
   * @return HtmlDomParser
   *
   * @throws \BadMethodCallException if the name is neither of the two aliases
   * @throws \RuntimeException
   * @throws \InvalidArgumentException
   */
  public static function __callStatic($name, $arguments) {
    $source = $arguments[0] ?? '';
    $libXMLExtraOptions = $arguments[1] ?? null;

    // reject unknown names before a parser instance is created
    if ($name !== 'str_get_html' && $name !== 'file_get_html') {
      throw new \BadMethodCallException('Method does not exist');
    }

    $parser = new self();

    if ($name === 'str_get_html') {
      return $parser->loadHtml($source, $libXMLExtraOptions);
    }

    return $parser->loadHtmlFile($source, $libXMLExtraOptions);
  }
202
203
  /** @noinspection MagicMethodsValidityInspection */
  /**
   * Read-only virtual properties (case-insensitive): outer html, inner html and plain text.
   *
   * @param string $name property name
   *
   * @return string|null null for an unknown property name
   */
  public function __get($name) {
    $property = \strtolower($name);

    if ($property === 'outerhtml' || $property === 'outertext') {
      return $this->html();
    }

    if ($property === 'innerhtml' || $property === 'innertext') {
      return $this->innerHtml();
    }

    if ($property === 'text' || $property === 'plaintext') {
      return $this->text();
    }

    return null;
  }
226
227
  /**
   * Calling the parser like a function is a shortcut for find().
   *
   * @param string   $selector CSS selector
   * @param int|null $idx      index of the wanted element, null for all matches
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function __invoke($selector, $idx = null) {
    $result = $this->find($selector, $idx);

    return $result;
  }
236
237
  /**
   * String conversion yields the same output as html().
   *
   * @return string
   */
  public function __toString() {
    $outerHtml = $this->html();

    return $outerHtml;
  }
243
244
  /**
   * No-op kept only for api-compatibility-reasons.
   *
   * @deprecated
   *
   * @return bool always true
   */
  public function clear(): bool {
    // nothing to release here
    return true;
  }
254
255
  /**
   * Swap special characters for placeholder tokens before the markup is parsed.
   *
   * Links found in the markup are replaced first by a copy whose "[", "]", "{" and "}"
   * are masked; afterwards the characters from $domReplaceHelper are masked in the whole string.
   *
   * @param string $html
   *
   * @return string
   */
  public static function replaceToPreserveHtmlEntities(string $html): string {
    $links = [];

    // cheap pre-check before running the link regex
    if (\strpos($html, 'http') !== false) {

      // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
      $urlPattern = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
      $found = [];
      \preg_match_all($urlPattern, $html, $found);

      if (!empty($found[1])) {
        $links = (array)$found[1];
      }
    }

    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    if ($links !== []) {
      $maskedLinks = [];
      foreach ($links as $key => $link) {
        $maskedLinks[$key] = \str_replace(
          self::$domLinkReplaceHelper['orig'],
          self::$domLinkReplaceHelper['tmp'],
          $link
        );
      }

      // link replacements go first, the generic character replacements follow
      $search = \array_merge($links, $search);
      $replace = \array_merge($maskedLinks, $replace);
    }

    return \str_replace($search, $replace, $html);
  }
294
295
  /**
   * Turn the placeholder tokens back into the original characters / broken-html fragments.
   *
   * @param string $html
   *
   * @return string
   */
  public static function putReplacedBackToPreserveHtmlEntities(string $html): string {
    // merged placeholder tables, built once per request
    static $DOM_REPLACE__HELPER_CACHE = null;

    if ($DOM_REPLACE__HELPER_CACHE === null) {
      $DOM_REPLACE__HELPER_CACHE = [
        'tmp'  => \array_merge(
          self::$domLinkReplaceHelper['tmp'],
          self::$domReplaceHelper['tmp']
        ),
        'orig' => \array_merge(
          self::$domLinkReplaceHelper['orig'],
          self::$domReplaceHelper['orig']
        ),
      ];
    }

    // only relevant when keepBrokenHtml() recorded at least one fragment
    if (!empty(self::$domBrokenReplaceHelper['tmp'])) {
      $html = \str_replace(self::$domBrokenReplaceHelper['tmp'], self::$domBrokenReplaceHelper['orig'], $html);

      $unwrapped = \preg_replace('/<____simple_html_dom__voku__broken_html_wrapper____[^>]*>(.*)<\/____simple_html_dom__voku__broken_html_wrapper____[^>]*>/i', '$1', $html);

      // preg_replace() returns null on a PCRE error; keep the markup instead of losing it
      if ($unwrapped !== null) {
        $html = $unwrapped;
      }
    }

    return \str_replace($DOM_REPLACE__HELPER_CACHE['tmp'], $DOM_REPLACE__HELPER_CACHE['orig'], $html);
  }
326
327
  /**
   * Create DOMDocument from HTML.
   *
   * The markup is first tried as XML via simplexml; if that fails or reports libxml errors,
   * it is loaded with DOMDocument::loadHTML() after the special characters were masked.
   * The "created without ..." flags are set from the raw input so that fixHtmlOutput()
   * can later strip the wrappers libxml adds.
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions additional LIBXML_* flags OR-ed into the default options
   *
   * @return \DOMDocument
   */
  private function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument {
    if (\strpos($html, '<') === false) {
      $this->isDOMDocumentCreatedWithoutHtml = true;
    } elseif (\strpos(\ltrim($html), '<') !== 0) {
      $this->isDOMDocumentCreatedWithoutWrapper = true;
    }

    if (\strpos($html, '<html') === false) {
      $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
    }

    if (\strpos($html, '<head>') === false) {
      $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
    }

    // set error level (previous values are restored at the end)
    $internalErrors = \libxml_use_internal_errors(true);
    $disableEntityLoader = \libxml_disable_entity_loader(true);
    \libxml_clear_errors();

    $optionsXml = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;

    if (\defined('LIBXML_BIGLINES')) {
      $optionsXml |= LIBXML_BIGLINES;
    }

    if (\defined('LIBXML_COMPACT')) {
      $optionsXml |= LIBXML_COMPACT;
    }

    if (\defined('LIBXML_HTML_NODEFDTD')) {
      $optionsXml |= LIBXML_HTML_NODEFDTD;
    }

    if ($libXMLExtraOptions !== null) {
      $optionsXml |= $libXMLExtraOptions;
    }

    $sxe = \simplexml_load_string($html, 'SimpleXMLElement', $optionsXml);
    if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
      // well-formed XML: reuse the document simplexml already built
      $this->document = \dom_import_simplexml($sxe)->ownerDocument;
    } else {

      $html = \trim($html);

      if ($this->keepBrokenHtml === true) {
        $html = $this->keepBrokenHtml($html);
      }

      // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
      // (haystack first: only prepend the declaration when the markup does not already start with one)
      $xmlHackUsed = false;
      if (\stripos($html, '<?xml') !== 0) {
        $xmlHackUsed = true;
        $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
      }

      $html = self::replaceToPreserveHtmlEntities($html);

      $this->document->loadHTML($html, $optionsXml);

      // remove the "xml-encoding" hack
      if ($xmlHackUsed === true) {
        foreach ($this->document->childNodes as $child) {
          if ($child->nodeType === XML_PI_NODE) {
            $this->document->removeChild($child);
            break;
          }
        }
      }

      \libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    \libxml_use_internal_errors($internalErrors);
    \libxml_disable_entity_loader($disableEntityLoader);

    return $this->document;
  }
417
418
  /**
   * Replace markup fragments matched as "broken" by tokens so they can be put back after parsing.
   *
   * Pass 1 masks the angle brackets of every matched open/close tag pair, pass 2 swaps the
   * remaining stray closing tags for tokens and records them in $domBrokenReplaceHelper,
   * then the masked brackets are restored. If anything changed, the result is put into a wrapper element.
   *
   * @param string $html
   *
   * @return string
   */
  protected function keepBrokenHtml(string $html): string {
    $untouched = $html;

    // masked bracket tokens and the markup they stand for (same index)
    $masks = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
    $brackets = ['</', '<', '>'];

    // pass 1: repeat until no further tag pair is found
    do {
      $before = $html;

      $html = (string)preg_replace_callback(
        '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
        static function ($m) {
          $openTag = '°lt_simple_html_dom__voku_°' . $m['element_start'] . $m['element_start_addon'] . '°gt_simple_html_dom__voku_°';
          $closeTag = '°lt/_simple_html_dom__voku_°' . $m['element_end'] . '°gt_simple_html_dom__voku_°';

          return $m['start'] . $openTag . $m['value'] . $closeTag . $m['end'];
        },
        $html
      );
    } while ($before !== $html);

    // pass 2: repeat until no further broken fragment is found
    do {
      $before = $html;

      $html = (string)preg_replace_callback(
        '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
        static function ($m) use ($masks, $brackets) {
          // store the fragment with real brackets, keyed by a token derived from its checksum
          $fragment = str_replace($masks, $brackets, $m['broken']);
          $token = '____simple_html_dom__voku__broken_html____' . crc32($fragment);

          self::$domBrokenReplaceHelper['orig'][] = $fragment;
          self::$domBrokenReplaceHelper['tmp'][] = $token;

          return $m['start'] . $token . $m['end'];
        },
        $html
      );
    } while ($before !== $html);

    // restore the brackets masked in pass 1
    $html = str_replace($masks, $brackets, $html);

    if ($untouched === $html) {
      return $html;
    }

    return '<____simple_html_dom__voku__broken_html_wrapper____>' . $html . '</____simple_html_dom__voku__broken_html_wrapper____>';
  }
486
487
  /**
   * Return the first element matching "#id".
   *
   * @param string $id
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementById(string $id) {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
497
498
  /**
   * Return the first element with the given tag name.
   *
   * @param string $name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank a blank node if no such element exists
   */
  public function getElementByTagName(string $name) {
    $first = $this->document->getElementsByTagName($name)->item(0);

    return $first === null
      ? new SimpleHtmlDomNodeBlank()
      : new SimpleHtmlDom($first);
  }
514
515
  /**
   * Return the elements matching "#id" (or only the one at $idx).
   *
   * @param string   $id
   * @param null|int $idx
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function getElementsById(string $id, $idx = null) {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
526
527
  /**
   * Return the elements with the given tag name (or only the one at $idx).
   *
   * @param string   $name
   * @param null|int $idx null for all elements; a negative value counts from the end
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementsByTagName(string $name, $idx = null) {
    $matches = $this->document->getElementsByTagName($name);

    $elements = new SimpleHtmlDomNode();
    foreach ($matches as $match) {
      $elements[] = new SimpleHtmlDom($match);
    }

    // no index: hand back the whole collection
    if ($idx === null) {
      return $elements;
    }

    // a negative index is an offset from the end
    $position = $idx < 0 ? \count($elements) + $idx : $idx;

    if (!isset($elements[$position])) {
      // nothing at that position
      return new SimpleHtmlDomNodeBlank();
    }

    return $elements[$position];
  }
562
563
  /**
   * Find the first node matching a CSS selector.
   *
   * @param string $selector
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function findOne(string $selector) {
    $first = $this->find($selector, 0);

    return $first;
  }
573
574
  /**
   * Find list of nodes with a CSS selector.
   *
   * The selector is converted to XPath and evaluated against the document.
   *
   * @param string   $selector
   * @param int|null $idx null for all matches; a negative value counts from the end
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function find(string $selector, $idx = null) {
    $query = SelectorConverter::toXPath($selector);

    $matches = (new \DOMXPath($this->document))->query($query);

    $elements = new SimpleHtmlDomNode();
    foreach ($matches as $match) {
      $elements[] = new SimpleHtmlDom($match);
    }

    // no index: hand back the whole collection
    if ($idx === null) {
      return $elements;
    }

    // a negative index is an offset from the end
    $position = $idx < 0 ? \count($elements) + $idx : $idx;

    if (!isset($elements[$position])) {
      // nothing at that position
      return new SimpleHtmlDomNodeBlank();
    }

    return $elements[$position];
  }
611
612
  /**
   * Clean up serialised markup: strip wrappers the input never had, decode entities
   * and url-encoding, and restore the characters masked before parsing.
   *
   * @param string $content
   * @param bool   $multiDecodeNewHtmlEntity decode repeatedly instead of once
   *
   * @return string
   */
  public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string {
    // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
    //          so we try to remove it here again ...

    // collect everything that has to be stripped for missing html / head wrappers
    $strip = [];

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      $strip = ["\n", "\r\n", "\r", '<body>', '</body>', '<html>', '</html>'];
    }

    if ($this->isDOMDocumentCreatedWithoutHeadWrapper === true) {
      $strip[] = '<head>';
      $strip[] = '</head>';
    }

    if ($strip !== []) {
      $content = \str_replace($strip, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutWrapper === true) {
      $content = (string)\preg_replace('/^<p>/', '', $content);
      $content = (string)\preg_replace('/<\/p>/', '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $plainTextLeftovers = [
        '<p>',
        '</p>',
        '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
      ];
      $content = \str_replace($plainTextLeftovers, '', $content);
    }

    /** @noinspection CheckTagEmptyBody */
    /** @noinspection HtmlExtraClosingTag */
    $artifacts = ['<simpleHtmlDomP>', '</simpleHtmlDomP>', '<head><head>', '</head></head>', '<br></br>'];
    $repaired = ['', '', '<head>', '</head>', '<br>'];
    $content = \trim(\str_replace($artifacts, $repaired, $content));

    if ($multiDecodeNewHtmlEntity !== true) {

      // single decoding pass
      $content = \rawurldecode(\html_entity_decode($content, ENT_QUOTES | ENT_HTML5));

    } elseif (\class_exists('\voku\helper\UTF8')) {

      /** @noinspection PhpUndefinedClassInspection */
      $content = UTF8::rawurldecode($content);

    } else {

      // decode until the string no longer changes
      do {
        $previous = $content;
        $content = \rawurldecode(\html_entity_decode($content, ENT_QUOTES | ENT_HTML5));
      } while ($previous !== $content);

    }

    return self::putReplacedBackToPreserveHtmlEntities($content);
  }
724
725
  /**
   * Get the underlying DOM document (the live instance, not a copy).
   *
   * @return \DOMDocument
   */
  public function getDocument(): \DOMDocument {
    $document = $this->document;

    return $document;
  }
731
732
  /**
   * Get the encoding used for the document.
   *
   * @return string
   */
  private function getEncoding(): string {
    $encoding = $this->encoding;

    return $encoding;
  }
740
741
  /**
   * Whether the loaded input contained no "<" at all.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml(): bool {
    $flag = $this->isDOMDocumentCreatedWithoutHtml;

    return $flag;
  }
747
748
  /**
   * Whether the loaded input contained no "<html".
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool {
    $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $flag;
  }
754
755
  /**
   * Whether the loaded input contained no "<head>".
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool {
    $flag = $this->isDOMDocumentCreatedWithoutHeadWrapper;

    return $flag;
  }
761
762
  /**
   * Whether the loaded input contained markup but did not start with "<".
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutWrapper(): bool {
    $flag = $this->isDOMDocumentCreatedWithoutWrapper;

    return $flag;
  }
768
769
  /**
   * Get dom node's outer html.
   *
   * A callback registered via set_callback() is invoked before the document is serialised.
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function html(bool $multiDecodeNewHtmlEntity = false): string {
    $hook = $this::$callback;

    if ($hook !== null) {
      // NOTE(review): the callback receives a one-element array holding this parser,
      // not the parser itself - confirm that this is what callers expect
      \call_user_func($hook, [$this]);
    }

    // without an <html> wrapper in the input only the root element is serialised
    $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
      ? $this->document->saveHTML($this->document->documentElement)
      : $this->document->saveHTML();

    return $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
  }
789
790
  /**
   * Switch the preservation of broken markup on or off for subsequent loads (fluent).
   *
   * @param bool $keepBrokenHtml
   *
   * @return HtmlDomParser this instance
   */
  public function useKeepBrokenHtml(bool $keepBrokenHtml): HtmlDomParser {
    // read by createDOMDocument() when the markup has to go through loadHTML()
    $this->keepBrokenHtml = $keepBrokenHtml;

    return $this;
  }
800
801
  /**
   * Get the HTML as XML.
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string the serialised document without the XML header
   */
  public function xml(bool $multiDecodeNewHtmlEntity = false): string {
    $serialized = $this->document->saveXML(null, \LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $withoutHeader = (string)\preg_replace('/<\?xml.*\?>/', '', $serialized);

    return $this->fixHtmlOutput(\ltrim($withoutHeader), $multiDecodeNewHtmlEntity);
  }
816
817
  /**
   * Get dom node's inner html.
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string the serialised children of the root element; built from an empty string
   *                when the document has no root element
   */
  public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string {
    // init
    $text = '';

    // an empty document has no root element; skip the loop instead of dereferencing null
    $root = $this->document->documentElement;
    if ($root !== null) {
      foreach ($root->childNodes as $node) {
        $text .= $this->document->saveHTML($node);
      }
    }

    return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
  }
834
835
  /**
   * Load HTML from string.
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions
   *
   * @return HtmlDomParser this instance
   *
   * @throws \InvalidArgumentException if argument is not string
   */
  public function loadHtml(string $html, $libXMLExtraOptions = null): self {
    $document = $this->createDOMDocument($html, $libXMLExtraOptions);
    $this->document = $document;

    return $this;
  }
850
851
  /**
   * Load HTML from file.
   *
   * @param string   $filePath local path or http(s) url
   * @param int|null $libXMLExtraOptions
   *
   * @return HtmlDomParser this instance
   *
   * @throws \RuntimeException if a local file does not exist or the content could not be read
   * @throws \InvalidArgumentException
   */
  public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): self {
    // the existence check only applies to local paths, not to http(s) urls
    if (
      !\preg_match("/^https?:\/\//i", $filePath)
      &&
      !\file_exists($filePath)
    ) {
      throw new \RuntimeException("File $filePath not found");
    }

    try {
      if (\class_exists('\voku\helper\UTF8')) {
        /** @noinspection PhpUndefinedClassInspection */
        $html = UTF8::file_get_contents($filePath);
      } else {
        $html = \file_get_contents($filePath);
      }
    } catch (\Exception $e) {
      // keep the original failure reachable via getPrevious()
      throw new \RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new \RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html, $libXMLExtraOptions);

    return $this;
  }
890
891
  /**
   * Get the inner html as string and, if a path is given, also write it to that file.
   *
   * @param string $filepath target file; empty string skips writing
   *
   * @return string the inner html
   */
  public function save(string $filepath = ''): string {
    $markup = $this->innerHtml();

    if ($filepath === '') {
      return $markup;
    }

    // exclusive lock while writing
    \file_put_contents($filepath, $markup, LOCK_EX);

    return $markup;
  }
906
907
  /**
   * Register the callback that html() invokes before serialising the document.
   * The callback is stored in a static property.
   *
   * @param callable|null $functionName
   */
  public function set_callback($functionName) {
    $hook = $functionName;

    $this::$callback = $hook;
  }
913
914
  /**
   * Get dom node's plain text.
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string the document's text content after fixHtmlOutput()
   */
  public function text(bool $multiDecodeNewHtmlEntity = false): string {
    $plain = $this->document->textContent;

    return $this->fixHtmlOutput($plain, $multiDecodeNewHtmlEntity);
  }
924
925
  public function __clone() {
    // give the copy its own DOMDocument instead of sharing the original's
    $ownDocument = clone $this->document;

    $this->document = $ownDocument;
  }
928
}
929