Completed
Push — master ( 468717...935663 )
by Lars
04:43
created

HtmlDomParser::loadHtml()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 6
ccs 3
cts 3
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 2
crap 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
/**
8
 * Class HtmlDomParser
9
 *
10
 * @package voku\helper
11
 *
12
 * @property-read string outerText <p>Get dom node's outer html (alias for "outerHtml").</p>
13
 * @property-read string outerHtml <p>Get dom node's outer html.</p>
14
 * @property-read string innerText <p>Get dom node's inner html (alias for "innerHtml").</p>
15
 * @property-read string innerHtml <p>Get dom node's inner html.</p>
16
 * @property-read string plaintext <p>Get dom node's plain text.</p>
17
 *
18
 * @method string outerText() <p>Get dom node's outer html (alias for "outerHtml()").</p>
19
 * @method string outerHtml() <p>Get dom node's outer html.</p>
20
 * @method string innerText() <p>Get dom node's inner html (alias for "innerHtml()").</p>
21
 *
22
 * @method HtmlDomParser load() load($html) <p>Load HTML from string.</p>
23
 * @method HtmlDomParser load_file() load_file($html) <p>Load HTML from file.</p>
24
 *
25
 * @method static HtmlDomParser file_get_html() file_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from
26
 *         file.</p>
27
 * @method static HtmlDomParser str_get_html() str_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from
28
 *         string.</p>
29
 */
30
class HtmlDomParser
31
{
32
  /**
   * Lower-cased legacy method names (as accepted by __call()) mapped to the
   * real method names of this class.
   *
   * @var string[]
   */
  protected static $functionAliases = [
      'outertext' => 'html',
      'outerhtml' => 'html',
      'innertext' => 'innerHtml',
      'innerhtml' => 'innerHtml',
      'load'      => 'loadHtml',
      'load_file' => 'loadHtmlFile',
  ];

  /**
   * Characters that are swapped for placeholders inside detected links
   * ('orig' => 'tmp', matched by position).
   *
   * @var string[][]
   */
  protected static $domLinkReplaceHelper = [
      'orig' => ['[', ']', '{', '}',],
      'tmp'  => [
          '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
          '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
          '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
          '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
      ],
  ];

  /**
   * Characters that are swapped for placeholders in the whole input
   * ('orig' => 'tmp', matched by position).
   *
   * @var string[][]
   */
  protected static $domReplaceHelper = [
      'orig' => ['&', '|', '+', '%', '@'],
      'tmp'  => [
          '____SIMPLE_HTML_DOM__VOKU__AMP____',
          '____SIMPLE_HTML_DOM__VOKU__PIPE____',
          '____SIMPLE_HTML_DOM__VOKU__PLUS____',
          '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
          '____SIMPLE_HTML_DOM__VOKU__AT____',
      ],
  ];

  /**
   * Tag name of the artificial element that is wrapped around some inputs
   * before parsing and stripped from the output again.
   *
   * @var string
   */
  protected static $domHtmlWrapperHelper = '____simple_html_dom__voku__html_wrapper____';

  /**
   * Fragments extracted by keepBrokenHtml(): 'orig' holds the original markup,
   * 'tmp' the placeholder that stands in for it. Reset by the constructor.
   *
   * @var string[][]
   */
  protected static $domBrokenReplaceHelper = [];

  /**
   * Optional callback that is invoked by html() before the output is rendered.
   *
   * @var callable|null
   */
  protected static $callback;

  /**
   * @var \DOMDocument
   */
  protected $document;

  /**
   * @var string
   */
  protected $encoding = 'UTF-8';

  /**
   * True when the parsed input contained no "<" at all.
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtml = false;

  /**
   * True when the parsed input contained markup but did not start with "<".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutWrapper = false;

  /**
   * True when the parsed input contained no "<head>".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHeadWrapper = false;

  /**
   * True when the parsed input contained no "<html".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;

  /**
   * Whether broken markup should be preserved; every check in this class
   * compares against "=== true", so false is equivalent to "not set".
   *
   * @var bool
   */
  protected $keepBrokenHtml = false;
117
118
  /**
   * Create a parser, optionally pre-filled from HTML, a SimpleHtmlDom or a \DOMNode.
   *
   * A SimpleHtmlDom is unwrapped to its node; a \DOMNode is deep-imported into
   * the new document; anything else that is not null is passed to loadHtml().
   *
   * @param string|SimpleHtmlDom|\DOMNode|null $element HTML code or SimpleHtmlDom, \DOMNode
   */
  public function __construct($element = null)
  {
    $this->document = new \DOMDocument('1.0', $this->getEncoding());

    // the broken-html placeholders are static, so start every parser with a clean list
    self::$domBrokenReplaceHelper = [];

    // DOMDocument settings
    $this->document->preserveWhiteSpace = true;
    $this->document->formatOutput = true;

    if ($element instanceof SimpleHtmlDom) {
      $element = $element->getNode();
    }

    if ($element === null) {
      return;
    }

    if (!($element instanceof \DOMNode)) {
      $this->loadHtml($element);

      return;
    }

    $importedNode = $this->document->importNode($element, true);
    if ($importedNode instanceof \DOMNode) {
      $this->document->appendChild($importedNode);
    }
  }
154
155
  /**
   * Dispatch legacy method names (see $functionAliases) to the real methods.
   *
   * The lookup is case-insensitive.
   *
   * @param string $name
   * @param array  $arguments
   *
   * @return mixed whatever the aliased method returns
   *
   * @throws \BadMethodCallException if no alias with that name exists
   */
  public function __call($name, $arguments)
  {
    $alias = \strtolower($name);

    if (!isset(self::$functionAliases[$alias])) {
      throw new \BadMethodCallException('Method does not exist: ' . $alias);
    }

    $target = [$this, self::$functionAliases[$alias]];

    return \call_user_func_array($target, $arguments);
  }
171
172
  /**
   * Static entry points "str_get_html" and "file_get_html".
   *
   * The first argument (default: empty string) is the HTML / the file path,
   * the second one (default: null) is passed on as extra libxml options.
   *
   * @param string $name
   * @param array  $arguments
   *
   * @return HtmlDomParser
   *
   * @throws \BadMethodCallException for every other method name
   * @throws \RuntimeException
   * @throws \InvalidArgumentException
   */
  public static function __callStatic($name, $arguments)
  {
    $source = $arguments[0] ?? '';
    $libXMLExtraOptions = $arguments[1] ?? null;

    if ($name === 'str_get_html') {
      return (new self())->loadHtml($source, $libXMLExtraOptions);
    }

    if ($name === 'file_get_html') {
      return (new self())->loadHtmlFile($source, $libXMLExtraOptions);
    }

    throw new \BadMethodCallException('Method does not exist');
  }
208
209
  /** @noinspection MagicMethodsValidityInspection */
  /**
   * Read-only magic properties (case-insensitive):
   * "outerHtml" / "outerText", "innerHtml" / "innerText", "text" / "plaintext".
   *
   * @param string $name
   *
   * @return string|null null for every unknown property name
   */
  public function __get($name)
  {
    $property = \strtolower($name);

    if ($property === 'outerhtml' || $property === 'outertext') {
      return $this->html();
    }

    if ($property === 'innerhtml' || $property === 'innertext') {
      return $this->innerHtml();
    }

    if ($property === 'text' || $property === 'plaintext') {
      return $this->text();
    }

    return null;
  }
233
234
  /**
   * Calling the parser like a function is a shortcut for find().
   *
   * @param string   $selector
   * @param int|null $idx
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function __invoke($selector, $idx = null)
  {
    $result = $this->find($selector, $idx);

    return $result;
  }
244
245
  /**
   * String conversion yields the same result as html().
   *
   * @return string
   */
  public function __toString()
  {
    $outerHtml = $this->html();

    return $outerHtml;
  }
252
253
  /**
   * No-op that only exists for api-compatibility-reasons.
   *
   * @deprecated
   *
   * @return bool always true
   */
  public function clear(): bool
  {
    // nothing to release here
    return true;
  }
264
265
  /**
   * Swap special characters for placeholders, so that they survive the DOM parsing.
   *
   * - "&", "|", "+", "%" and "@" are replaced everywhere in the input
   * - "[", "]", "{" and "}" are additionally replaced inside http(s) links
   *
   * putReplacedBackToPreserveHtmlEntities() reverts the placeholders.
   *
   * @param string $html
   *
   * @return string
   */
  public static function replaceToPreserveHtmlEntities(string $html): string
  {
    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    // Cheap pre-check before running the regex. It is case-insensitive because
    // the link regex below is case-insensitive too ("HTTPS://..." is a link as well).
    if (\stripos($html, 'http') !== false) {

      // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
      $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
      $matches = [];
      \preg_match_all($regExUrl, $html, $matches);

      if (!empty($matches[1])) {
        $linksOld = $matches[1];
        $linksNew = [];

        foreach ($linksOld as $linkKey => $linkOld) {
          $linksNew[$linkKey] = \str_replace(
              self::$domLinkReplaceHelper['orig'],
              self::$domLinkReplaceHelper['tmp'],
              $linkOld
          );
        }

        // links first: str_replace() works through the pairs in order, so the
        // generic replacements are applied to the already rewritten links as well
        $search = \array_merge($linksOld, $search);
        $replace = \array_merge($linksNew, $replace);
      }
    }

    return \str_replace($search, $replace, $html);
  }
305
306
  /**
   * Revert the placeholders from replaceToPreserveHtmlEntities() and keepBrokenHtml()
   * and drop the artificial html-wrapper tags.
   *
   * @param string $html
   *
   * @return string
   */
  public static function putReplacedBackToPreserveHtmlEntities(string $html): string
  {
    // the placeholder => original table never changes, so it is built only once
    static $replaceTable = null;

    if ($replaceTable === null) {
      $placeholders = \array_merge(
          self::$domLinkReplaceHelper['tmp'],
          self::$domReplaceHelper['tmp']
      );
      $originals = \array_merge(
          self::$domLinkReplaceHelper['orig'],
          self::$domReplaceHelper['orig']
      );

      // the wrapper tags are replaced by nothing
      $placeholders['html_wrapper__start'] = '<' . self::$domHtmlWrapperHelper . '>';
      $placeholders['html_wrapper__end'] = '</' . self::$domHtmlWrapperHelper . '>';
      $originals['html_wrapper__start'] = '';
      $originals['html_wrapper__end'] = '';

      $replaceTable = [
          'tmp'  => $placeholders,
          'orig' => $originals,
      ];
    }

    $hasBrokenHtmlPlaceholders = isset(self::$domBrokenReplaceHelper['tmp'])
                                 &&
                                 \count(self::$domBrokenReplaceHelper['tmp']) > 0;

    if ($hasBrokenHtmlPlaceholders) {
      $html = \str_replace(
          self::$domBrokenReplaceHelper['tmp'],
          self::$domBrokenReplaceHelper['orig'],
          $html
      );
    }

    return \str_replace($replaceTable['tmp'], $replaceTable['orig'], $html);
  }
342
343
  /**
   * Create DOMDocument from HTML.
   *
   * The input is first tried as XML via simplexml; if that fails or reports
   * libxml errors, DOMDocument::loadHTML() is used instead. The method also
   * records which wrapper elements the input did not contain, so that
   * fixHtmlOutput() can strip them from the output again.
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions additional LIBXML_* flags, or-ed into the defaults
   *
   * @return \DOMDocument
   */
  private function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
  {
    if ($this->keepBrokenHtml === true) {
      $html = $this->keepBrokenHtml(\trim($html));
    }

    if (\strpos($html, '<') === false) {
      $this->isDOMDocumentCreatedWithoutHtml = true;
    } elseif (\strpos(\ltrim($html), '<') !== 0) {
      $this->isDOMDocumentCreatedWithoutWrapper = true;
    }

    if (\strpos($html, '<html') === false) {
      $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
    }

    if (\strpos($html, '<head>') === false) {
      $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
    }

    // set error level
    $internalErrors = \libxml_use_internal_errors(true);

    // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is only
    // touched on older versions; null means "nothing to restore"
    $disableEntityLoader = null;
    if (\PHP_VERSION_ID < 80000) {
      $disableEntityLoader = \libxml_disable_entity_loader(true);
    }

    \libxml_clear_errors();

    try {
      $optionsXml = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;

      if (\defined('LIBXML_BIGLINES')) {
        $optionsXml |= LIBXML_BIGLINES;
      }

      if (\defined('LIBXML_COMPACT')) {
        $optionsXml |= LIBXML_COMPACT;
      }

      if (\defined('LIBXML_HTML_NODEFDTD')) {
        $optionsXml |= LIBXML_HTML_NODEFDTD;
      }

      if ($libXMLExtraOptions !== null) {
        $optionsXml |= $libXMLExtraOptions;
      }

      if (
          $this->isDOMDocumentCreatedWithoutWrapper === true
          ||
          $this->keepBrokenHtml === true
      ) {
        $html = '<' . self::$domHtmlWrapperHelper . '>' . $html . '</' . self::$domHtmlWrapperHelper . '>';
      }

      $html = self::replaceToPreserveHtmlEntities($html);

      $sxe = \simplexml_load_string($html, \SimpleXMLElement::class, $optionsXml);
      if ($sxe !== false && \count(\libxml_get_errors()) === 0) {

        // the input is well-formed XML: take the document as it is
        $this->document = \dom_import_simplexml($sxe)->ownerDocument;

      } else {

        // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
        //
        // Only prepend the declaration if the input does not already start with
        // one (haystack first, needle second).
        $xmlHackUsed = false;
        if (\stripos($html, '<?xml') !== 0) {
          $xmlHackUsed = true;
          $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
        }

        $this->document->loadHTML($html, $optionsXml);

        // remove the "xml-encoding" hack
        if ($xmlHackUsed === true) {
          foreach ($this->document->childNodes as $child) {
            if ($child->nodeType === XML_PI_NODE) {
              $this->document->removeChild($child);
              break;
            }
          }
        }
      }

      // set encoding
      $this->document->encoding = $this->getEncoding();
    } finally {
      // restore lib-xml settings, even if the parsing above has thrown
      \libxml_clear_errors();
      \libxml_use_internal_errors($internalErrors);

      if ($disableEntityLoader !== null) {
        \libxml_disable_entity_loader($disableEntityLoader);
      }
    }

    return $this->document;
  }
441
442
  /**
   * Replace broken markup with placeholders, so that the DOM parser cannot "repair" it.
   *
   * 1. Every element that has a matching closing tag is masked, one pair per
   *    pass, until nothing changes anymore.
   * 2. Runs of closing-tag-like fragments that are still unmasked are swapped
   *    for a hash placeholder; original and placeholder are recorded in
   *    self::$domBrokenReplaceHelper, so that the output can be restored later.
   * 3. The masks from step 1 are turned back into real markup.
   *
   * If a regular expression fails (preg_replace_callback() returns null), the
   * affected step stops and keeps the markup it had before the failing pass,
   * instead of replacing the whole input with an empty string.
   *
   * @param string $html
   *
   * @return string
   */
  protected function keepBrokenHtml(string $html): string
  {
    $masks = ['°lt/_simple_html_dom__voku_°', '°lt_simple_html_dom__voku_°', '°gt_simple_html_dom__voku_°'];
    $markup = ['</', '<', '>'];

    do {
      $original = $html;

      $replaced = \preg_replace_callback(
          '/(?<start>.*)<(?<element_start>[a-z]+)(?<element_start_addon> [^>]*)?>(?<value>.*?)<\/(?<element_end>\2)>(?<end>.*)/sui',
          function ($matches) {
            return $matches['start'] .
                   '°lt_simple_html_dom__voku_°' . $matches['element_start'] . $matches['element_start_addon'] . '°gt_simple_html_dom__voku_°' .
                   $matches['value'] .
                   '°lt/_simple_html_dom__voku_°' . $matches['element_end'] . '°gt_simple_html_dom__voku_°' .
                   $matches['end'];
          },
          $html
      );

      if ($replaced === null) {
        // PCRE error: $html is still the result of the last successful pass
        break;
      }

      $html = $replaced;

    } while ($original !== $html);

    do {
      $original = $html;

      $replaced = \preg_replace_callback(
          '/(?<start>[^<]*)?(?<broken>(?:(?:<\/\w+(?:\s+\w+=\\"[^\"]+\\")*+)(?:[^<]+)>)+)(?<end>.*)/u',
          function ($matches) use ($masks, $markup) {

            // store the fragment with real markup, not with the masks from the first step
            $matches['broken'] = \str_replace($masks, $markup, $matches['broken']);

            self::$domBrokenReplaceHelper['orig'][] = $matches['broken'];
            self::$domBrokenReplaceHelper['tmp'][] = $matchesHash = '____simple_html_dom__voku__broken_html____' . \crc32($matches['broken']);

            return $matches['start'] . $matchesHash . $matches['end'];
          },
          $html
      );

      if ($replaced === null) {
        // PCRE error: $html is still the result of the last successful pass
        break;
      }

      $html = $replaced;

    } while ($original !== $html);

    return \str_replace($masks, $markup, $html);
  }
497
498
  /**
   * Return the first element that matches the selector "#<id>".
   *
   * @param string $id
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementById(string $id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
509
510
  /**
   * Return the first element with the given tag name.
   *
   * @param string $name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank a blank node if there is no such element
   */
  public function getElementByTagName(string $name)
  {
    $firstNode = $this->document->getElementsByTagName($name)->item(0);

    return $firstNode === null
        ? new SimpleHtmlDomNodeBlank()
        : new SimpleHtmlDom($firstNode);
  }
527
528
  /**
   * Return the elements that match the selector "#<id>".
   *
   * @param string   $id
   * @param null|int $idx passed through to find()
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function getElementsById(string $id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
540
541
  /**
   * Return the elements with the given tag name.
   *
   * @param string   $name
   * @param null|int $idx null: all elements; otherwise the element at that
   *                      position, where negative values count from the end
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementsByTagName(string $name, $idx = null)
  {
    $elements = new SimpleHtmlDomNode();

    foreach ($this->document->getElementsByTagName($name) as $node) {
      $elements[] = new SimpleHtmlDom($node);
    }

    // return all elements
    if ($idx === null) {
      return $elements;
    }

    // negative values are relative to the end of the list
    $position = $idx < 0 ? \count($elements) + $idx : $idx;

    // return one element, or a blank-element if there is none at that position
    return isset($elements[$position])
        ? $elements[$position]
        : new SimpleHtmlDomNodeBlank();
  }
577
578
  /**
   * Return the first node that matches a CSS selector.
   *
   * @param string $selector
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function findOne(string $selector)
  {
    $firstMatch = $this->find($selector, 0);

    return $firstMatch;
  }
589
590
  /**
   * Find list of nodes with a CSS selector.
   *
   * The selector is converted to XPath via SelectorConverter::toXPath() and
   * evaluated against the whole document.
   *
   * @param string   $selector
   * @param int|null $idx null: all matches; otherwise the match at that
   *                      position, where negative values count from the end
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function find(string $selector, $idx = null)
  {
    $xPathQuery = SelectorConverter::toXPath($selector);

    $xPath = new \DOMXPath($this->document);
    $nodesList = $xPath->query($xPathQuery);
    $elements = new SimpleHtmlDomNode();

    // DOMXPath::query() returns false for a malformed expression; treat that
    // like "no matches" instead of iterating over a boolean
    if ($nodesList !== false) {
      foreach ($nodesList as $node) {
        $elements[] = new SimpleHtmlDom($node);
      }
    }

    // return all elements
    if (null === $idx) {
      return $elements;
    }

    // handle negative values
    if ($idx < 0) {
      $idx = \count($elements) + $idx;
    }

    // return one element
    if (isset($elements[$idx])) {
      return $elements[$idx];
    }

    // return a blank-element
    return new SimpleHtmlDomNodeBlank();
  }
628
629
  /**
   * Clean up serialized DOM output.
   *
   * Removes the wrapper markup that the input did not contain (according to the
   * flags set while parsing), removes internal helper tags, decodes html
   * entities and url-encoding and finally reverts the placeholders via
   * putReplacedBackToPreserveHtmlEntities().
   *
   * @param string $content
   * @param bool   $multiDecodeNewHtmlEntity true: decode repeatedly until the
   *                                         content does not change anymore
   *                                         (or via UTF8::rawurldecode() if
   *                                         that class is available)
   *
   * @return string
   */
  public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
  {
    // INFO: DOMDocument will encapsulate plaintext into a e.g. paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      $htmlWrapperParts = ["\n", "\r\n", "\r", '<body>', '</body>', '<html>', '</html>'];
      $content = \str_replace($htmlWrapperParts, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHeadWrapper === true) {
      $content = \str_replace(['<head>', '</head>'], '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutWrapper === true) {
      // the two patterns are applied one after the other
      $content = (string)\preg_replace(['/^<p>/', '/<\/p>/'], '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $plainTextParts = [
          '<p>',
          '</p>',
          '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
      ];
      $content = \str_replace($plainTextParts, '', $content);
    }

    /** @noinspection CheckTagEmptyBody */
    /** @noinspection HtmlExtraClosingTag */
    $helperTags = ['<simpleHtmlDomP>', '</simpleHtmlDomP>', '<head><head>', '</head></head>', '<br></br>'];
    $helperTagsFixed = ['', '', '<head>', '</head>', '<br>'];
    $content = \trim(\str_replace($helperTags, $helperTagsFixed, $content));

    if ($multiDecodeNewHtmlEntity === true && \class_exists('\voku\helper\UTF8')) {

      /** @noinspection PhpUndefinedClassInspection */
      $content = UTF8::rawurldecode($content);

    } else {

      // one pass by default; with $multiDecodeNewHtmlEntity keep going until stable
      do {
        $contentBefore = $content;

        $content = \rawurldecode(
            \html_entity_decode($content, ENT_QUOTES | ENT_HTML5)
        );

      } while ($multiDecodeNewHtmlEntity === true && $contentBefore !== $content);

    }

    return self::putReplacedBackToPreserveHtmlEntities($content);
  }
742
743
  /**
   * Get the underlying DOM document (the instance itself, not a copy).
   *
   * @return \DOMDocument
   */
  public function getDocument(): \DOMDocument
  {
    $document = $this->document;

    return $document;
  }
750
751
  /**
   * Get the encoding that is used for the document.
   *
   * @return string
   */
  private function getEncoding(): string
  {
    $encoding = $this->encoding;

    return $encoding;
  }
760
761
  /**
   * Get the flag that createDOMDocument() sets when the input contained no "<".
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml(): bool
  {
    $withoutHtml = $this->isDOMDocumentCreatedWithoutHtml;

    return $withoutHtml;
  }
768
769
  /**
   * Get the flag that createDOMDocument() sets when the input contained no "<html".
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
  {
    $withoutHtmlWrapper = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $withoutHtmlWrapper;
  }
776
777
  /**
   * Get the flag that createDOMDocument() sets when the input contained no "<head>".
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
  {
    $withoutHeadWrapper = $this->isDOMDocumentCreatedWithoutHeadWrapper;

    return $withoutHeadWrapper;
  }
784
785
  /**
   * Get the flag that createDOMDocument() sets when the input contained markup
   * but did not start with "<".
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutWrapper(): bool
  {
    $withoutWrapper = $this->isDOMDocumentCreatedWithoutWrapper;

    return $withoutWrapper;
  }
792
793
  /**
   * Get dom node's outer html.
   *
   * A callback registered via set_callback() is invoked first; it receives a
   * single argument: an array that contains this parser.
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function html(bool $multiDecodeNewHtmlEntity = false): string
  {
    if ($this::$callback !== null) {
      \call_user_func($this::$callback, [$this]);
    }

    // without an <html> element in the input only the root element is
    // serialized, otherwise the whole document
    if (!$this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
      $content = $this->document->saveHTML();
    } else {
      $content = $this->document->saveHTML($this->document->documentElement);
    }

    return $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
  }
814
815
  /**
   * Switch the "keep broken html" mode on or off; it is evaluated the next
   * time HTML is loaded.
   *
   * @param bool $keepBrokenHtml
   *
   * @return HtmlDomParser this instance, for method chaining
   */
  public function useKeepBrokenHtml(bool $keepBrokenHtml): self
  {
    $this->keepBrokenHtml = $keepBrokenHtml;

    // fluent interface
    return $this;
  }
826
827
  /**
   * Get the HTML as XML (serialized with LIBXML_NOEMPTYTAG, without the XML-header).
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function xml(bool $multiDecodeNewHtmlEntity = false): string
  {
    $serialized = $this->document->saveXML(null, \LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $withoutHeader = (string)\preg_replace('/<\?xml.*\?>/', '', $serialized);

    return $this->fixHtmlOutput(\ltrim($withoutHeader), $multiDecodeNewHtmlEntity);
  }
843
844
  /**
   * Get dom node's inner html.
   *
   * Serializes every child node of the document's root element. A document
   * without a root element (e.g. a parser that has not loaded anything yet)
   * yields the cleaned-up empty string instead of accessing "childNodes" on null.
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
  {
    // init
    $text = '';

    $rootElement = $this->document->documentElement;
    if ($rootElement !== null) {
      foreach ($rootElement->childNodes as $node) {
        $text .= $this->document->saveHTML($node);
      }
    }

    return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
  }
862
863
  /**
   * Load HTML from string; the current document is replaced by the parsed one.
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions
   *
   * @return HtmlDomParser this instance, for method chaining
   *
   * @throws \InvalidArgumentException if argument is not string
   */
  public function loadHtml(string $html, $libXMLExtraOptions = null): self
  {
    $parsedDocument = $this->createDOMDocument($html, $libXMLExtraOptions);
    $this->document = $parsedDocument;

    return $this;
  }
879
880
  /**
   * Load HTML from file.
   *
   * Paths that start with "http://" or "https://" are not checked for
   * existence, everything else must exist on the file system. The content is
   * read via UTF8::file_get_contents() if that class is available, otherwise
   * via file_get_contents().
   *
   * @param string   $filePath
   * @param int|null $libXMLExtraOptions
   *
   * @return HtmlDomParser this instance, for method chaining
   *
   * @throws \RuntimeException if the file does not exist or cannot be read;
   *                           an exception raised while reading is attached
   *                           as "previous"
   * @throws \InvalidArgumentException
   */
  public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): self
  {
    if (
        !\preg_match("/^https?:\/\//i", $filePath)
        &&
        !\file_exists($filePath)
    ) {
      throw new \RuntimeException("File $filePath not found");
    }

    try {
      if (\class_exists('\voku\helper\UTF8')) {
        /** @noinspection PhpUndefinedClassInspection */
        $html = UTF8::file_get_contents($filePath);
      } else {
        $html = \file_get_contents($filePath);
      }
    } catch (\Exception $e) {
      // keep the original failure reachable via getPrevious()
      throw new \RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new \RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html, $libXMLExtraOptions);

    return $this;
  }
920
921
  /**
   * Get the inner html as string and optionally write it to a file as well.
   *
   * @param string $filepath target file; nothing is written for an empty string
   *
   * @return string the inner html
   */
  public function save(string $filepath = ''): string
  {
    $innerHtml = $this->innerHtml();

    if ($filepath === '') {
      return $innerHtml;
    }

    // exclusive lock while writing
    \file_put_contents($filepath, $innerHtml, LOCK_EX);

    return $innerHtml;
  }
937
938
  /**
   * Register the callback that html() invokes before it renders the output.
   *
   * The callback is stored in a static property.
   *
   * @param callable|null $functionName
   */
  public function set_callback($functionName)
  {
    $callback = $functionName;

    $this::$callback = $callback;
  }
945
946
  /**
   * Get dom node's plain text (the text content of the whole document).
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function text(bool $multiDecodeNewHtmlEntity = false): string
  {
    $plainText = $this->document->textContent;

    return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
  }
957
958
  /**
   * Give the cloned parser its own copy of the DOM document, so that the clone
   * and the original do not share one \DOMDocument instance.
   */
  public function __clone()
  {
    $documentCopy = clone $this->document;

    $this->document = $documentCopy;
  }
962
}
963