Completed
Push — master ( d1a6e0...a703a1 )
by Lars
02:13
created

HtmlDomParser::replaceToPreserveHtmlEntities()   B

Complexity

Conditions 6
Paths 6

Size

Total Lines 35
Code Lines 21

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 23
CRAP Score 6

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 35
ccs 23
cts 23
cp 1
rs 8.439
cc 6
eloc 21
nc 6
nop 1
crap 6
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
use BadMethodCallException;
8
use DOMDocument;
9
use DOMXPath;
10
use InvalidArgumentException;
11
use RuntimeException;
12
13
/**
14
 * Class HtmlDomParser
15
 *
16
 * @package voku\helper
17
 *
18
 * @property-read string outerText <p>Get dom node's outer html (alias for "outerHtml").</p>
19
 * @property-read string outerHtml <p>Get dom node's outer html.</p>
20
 * @property-read string innerText <p>Get dom node's inner html (alias for "innerHtml").</p>
21
 * @property-read string innerHtml <p>Get dom node's inner html.</p>
22
 * @property-read string plaintext <p>Get dom node's plain text.</p>
23
 *
24
 * @method string outerText() <p>Get dom node's outer html (alias for "outerHtml()").</p>
25
 * @method string outerHtml() <p>Get dom node's outer html.</p>
26
 * @method string innerText() <p>Get dom node's inner html (alias for "innerHtml()").</p>
27
 *
28
 * @method HtmlDomParser load() load($html) <p>Load HTML from string.</p>
29
 * @method HtmlDomParser load_file() load_file($html) <p>Load HTML from file.</p>
30
 *
31
 * @method static HtmlDomParser file_get_html() file_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from
32
 *         file.</p>
33
 * @method static HtmlDomParser str_get_html() str_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from
34
 *         string.</p>
35
 */
36
class HtmlDomParser
37
{
38
  /**
   * Legacy method aliases resolved by __call(): lower-cased alias => real method name.
   *
   * @var string[]
   */
  protected static $functionAliases = [
      'outertext' => 'html',
      'outerhtml' => 'html',
      'innertext' => 'innerHtml',
      'innerhtml' => 'innerHtml',
      'load'      => 'loadHtml',
      'load_file' => 'loadHtmlFile',
  ];

  /**
   * Characters that are masked inside detected links ('orig') and the
   * placeholders they are swapped with ('tmp'); both lists are index-aligned.
   *
   * @var string[][]
   */
  protected static $domLinkReplaceHelper = [
      'orig' => [
          '[',
          ']',
          '{',
          '}',
      ],
      'tmp'  => [
          '!!!!SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT!!!!',
          '!!!!SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT!!!!',
          '!!!!SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT!!!!',
          '!!!!SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT!!!!',
      ],
  ];

  /**
   * Characters that are masked in the whole input ('orig') and the
   * placeholders they are swapped with ('tmp'); both lists are index-aligned.
   *
   * @var string[][]
   */
  protected static $domReplaceHelper = [
      'orig' => [
          '&',
          '|',
          '+',
          '%',
      ],
      'tmp'  => [
          '!!!!SIMPLE_HTML_DOM__VOKU__AMP!!!!',
          '!!!!SIMPLE_HTML_DOM__VOKU__PIPE!!!!',
          '!!!!SIMPLE_HTML_DOM__VOKU__PLUS!!!!',
          '!!!!SIMPLE_HTML_DOM__VOKU__PERCENT!!!!',
      ],
  ];

  /**
   * Optional callback that html() invokes before serializing; set via set_callback().
   * It is static, i.e. shared by all parser instances.
   *
   * @var callable|null
   */
  protected static $callback;

  /**
   * The wrapped DOM document.
   *
   * @var DOMDocument
   */
  protected $document;

  /**
   * Encoding used when creating and loading the document.
   *
   * @var string
   */
  protected $encoding = 'UTF-8';

  /**
   * Set by createDOMDocument() when the input contains no "<" at all.
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtml = false;

  /**
   * Set by createDOMDocument() when the input contains "<" but does not start with it.
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutWrapper = false;

  /**
   * Set by createDOMDocument() when the input contains no "<head>".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHeadWrapper = false;

  /**
   * Set by createDOMDocument() when the input contains no "<html".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
110
111
  /**
   * Create a parser, optionally pre-filled from HTML or from an existing node.
   *
   * A SimpleHtmlDom is unwrapped to its underlying node first. A \DOMNode is
   * deep-imported into the fresh document; anything else that is not null is
   * handed to loadHtml().
   *
   * @param string|SimpleHtmlDom|\DOMNode $element HTML code or SimpleHtmlDom, \DOMNode
   *
   * @throws \InvalidArgumentException
   */
  public function __construct($element = null)
  {
    $document = new \DOMDocument('1.0', $this->getEncoding());
    $document->preserveWhiteSpace = true;
    $document->formatOutput = true;
    $this->document = $document;

    $source = ($element instanceof SimpleHtmlDom) ? $element->getNode() : $element;

    if ($source instanceof \DOMNode) {
      // deep copy, so the foreign node becomes owned by our document
      $imported = $this->document->importNode($source, true);
      if ($imported instanceof \DOMNode) {
        $this->document->appendChild($imported);
      }
    } elseif ($source !== null) {
      $this->loadHtml($source);
    }
  }
144
145
  /**
   * Dispatch legacy (case-insensitive) method aliases to their real methods.
   *
   * NOTE: the code-quality report for this file flags this method as
   * duplicated elsewhere in the project.
   *
   * @param string $name      called method name
   * @param array  $arguments arguments passed to the call
   *
   * @return bool|mixed whatever the aliased method returns
   *
   * @throws \BadMethodCallException if the name is not a known alias
   */
  public function __call($name, $arguments)
  {
    $name = \strtolower($name);

    if (!isset(self::$functionAliases[$name])) {
      /** @noinspection ExceptionsAnnotatingAndHandlingInspection */
      throw new BadMethodCallException('Method does not exist: ' . $name);
    }

    $target = self::$functionAliases[$name];

    return $this->{$target}(...$arguments);
  }
162
163
  /**
   * Static factory aliases: "str_get_html" and "file_get_html".
   *
   * The first argument (default: empty string) is the HTML / file path, the
   * second one (default: null) the extra libxml options.
   *
   * @param string $name      called static method name
   * @param array  $arguments arguments passed to the call
   *
   * @return HtmlDomParser
   *
   * @throws \BadMethodCallException if the name is neither of the two aliases
   * @throws \RuntimeException
   * @throws \InvalidArgumentException
   */
  public static function __callStatic($name, $arguments)
  {
    $source = $arguments[0] ?? '';
    $libXMLExtraOptions = $arguments[1] ?? null;

    if ($name === 'str_get_html') {
      /** @noinspection ExceptionsAnnotatingAndHandlingInspection */
      return (new self())->loadHtml($source, $libXMLExtraOptions);
    }

    if ($name === 'file_get_html') {
      /** @noinspection ExceptionsAnnotatingAndHandlingInspection */
      return (new self())->loadHtmlFile($source, $libXMLExtraOptions);
    }

    throw new BadMethodCallException('Method does not exist');
  }
201
202
  /** @noinspection MagicMethodsValidityInspection */
  /**
   * Magic read access for the (case-insensitive) virtual properties
   * outerHtml / outerText, innerHtml / innerText and text / plaintext.
   *
   * @param string $name
   *
   * @return string|null null for any other property name
   */
  public function __get($name)
  {
    $property = strtolower($name);

    if (\in_array($property, ['outerhtml', 'outertext'], true)) {
      return $this->html();
    }

    if (\in_array($property, ['innerhtml', 'innertext'], true)) {
      return $this->innerHtml();
    }

    if (\in_array($property, ['text', 'plaintext'], true)) {
      return $this->text();
    }

    return null;
  }
226
227
  /**
   * Calling the parser like a function is a shortcut for find().
   *
   * @param string   $selector
   * @param int|null $idx
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function __invoke($selector, $idx = null)
  {
    $result = $this->find($selector, $idx);

    return $result;
  }
237
238
  /**
   * String conversion yields the same output as html().
   *
   * @return string
   */
  public function __toString()
  {
    $outerHtml = $this->html();

    return $outerHtml;
  }
245
246
  /**
   * No-op that is only kept for API compatibility.
   *
   * @deprecated
   *
   * @return bool always true
   */
  public function clear(): bool
  {
    // nothing to release here
    return true;
  }
257
258
  /**
   * Mask characters that must survive DOM parsing unchanged.
   *
   * "&", "|", "+" and "%" are replaced by placeholders everywhere. If the
   * input contains "http", every detected link additionally gets its "[",
   * "]", "{" and "}" masked. putReplacedBackToPreserveHtmlEntities() is the
   * inverse operation.
   *
   * @param string $html
   *
   * @return string the input with placeholders instead of the masked characters
   */
  public static function replaceToPreserveHtmlEntities($html): string
  {
    // the generic replacements are always applied ...
    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    // ... link replacements are put in front of them when links are found
    if (\strpos($html, 'http') !== false) {
      // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
      $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';

      $matches = [];
      \preg_match_all($regExUrl, $html, $matches);

      if (!empty($matches[1])) {
        $links = (array)$matches[1];

        // same links, but with the bracket characters masked
        $maskedLinks = \str_replace(
            self::$domLinkReplaceHelper['orig'],
            self::$domLinkReplaceHelper['tmp'],
            $links
        );

        $search = \array_merge($links, $search);
        $replace = \array_merge($maskedLinks, $replace);
      }
    }

    return \str_replace($search, $replace, $html);
  }
298
299
  /**
   * Turn the placeholders written by replaceToPreserveHtmlEntities() back
   * into the original characters.
   *
   * @param string $html
   *
   * @return string
   */
  public static function putReplacedBackToPreserveHtmlEntities($html): string
  {
    // the merged lookup lists are built once per request and then re-used
    static $placeholders = null;
    static $originals = null;

    if ($placeholders === null) {
      $placeholders = array_merge(
          self::$domLinkReplaceHelper['tmp'],
          self::$domReplaceHelper['tmp']
      );
      $originals = array_merge(
          self::$domLinkReplaceHelper['orig'],
          self::$domReplaceHelper['orig']
      );
    }

    return str_replace($placeholders, $originals, $html);
  }
321
322
  /**
   * Create DOMDocument from HTML.
   *
   * The input is first tried as XML via SimpleXML. If that fails or reports
   * any libxml error, it is loaded with DOMDocument::loadHTML() instead, after
   * masking special characters (see replaceToPreserveHtmlEntities()) and, if
   * the input does not already start with an XML declaration, prepending a
   * temporary "<?xml encoding=...?>" hint.
   *
   * Side effects: sets the isDOMDocumentCreatedWithout* flags, replaces / fills
   * $this->document and temporarily changes the libxml error / entity-loader
   * settings (they are restored before returning).
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions extra LIBXML_* flags, OR-ed into the parser options
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html, $libXMLExtraOptions = null): \DOMDocument
  {
    // remember what the input was missing, the output methods consult these flags
    // NOTE(review): the flags are only ever switched on, never reset - confirm that
    //               re-using one parser instance for several loads is not expected.
    if (\strpos($html, '<') === false) {
      $this->isDOMDocumentCreatedWithoutHtml = true;
    } elseif (\strpos(\ltrim($html), '<') !== 0) {
      $this->isDOMDocumentCreatedWithoutWrapper = true;
    }

    if (\strpos($html, '<html') === false) {
      $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
    }

    if (\strpos($html, '<head>') === false) {
      $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
    }

    // set error level
    // NOTE(review): libxml_disable_entity_loader() is deprecated as of PHP 8.0 -
    //               revisit when the minimum PHP version is raised.
    $internalErrors = \libxml_use_internal_errors(true);
    $disableEntityLoader = \libxml_disable_entity_loader(true);
    \libxml_clear_errors();

    $optionsSimpleXml = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;
    $optionsXml = 0;

    if (\defined('LIBXML_BIGLINES')) {
      $optionsSimpleXml |= LIBXML_BIGLINES;
    }

    if (\defined('LIBXML_COMPACT')) {
      $optionsSimpleXml |= LIBXML_COMPACT;
    }

    if (\defined('LIBXML_HTML_NOIMPLIED')) {
      $optionsSimpleXml |= LIBXML_HTML_NOIMPLIED;
    }

    if (\defined('LIBXML_HTML_NODEFDTD')) {
      $optionsSimpleXml |= LIBXML_HTML_NODEFDTD;
    }

    if ($libXMLExtraOptions !== null) {
      $optionsSimpleXml |= $libXMLExtraOptions;
      $optionsXml |= $libXMLExtraOptions;
    }

    $sxe = \simplexml_load_string($html, 'SimpleXMLElement', $optionsSimpleXml);
    if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
      // well-formed XML: take the document over as it is
      $this->document = \dom_import_simplexml($sxe)->ownerDocument;
    } else {

      // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
      $html = \trim($html);
      $xmlHackUsed = false;
      // stripos() takes (haystack, needle): only add the hint if the input
      // does not already begin with an XML declaration
      if (\stripos($html, '<?xml') !== 0) {
        $xmlHackUsed = true;
        $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
      }

      $html = self::replaceToPreserveHtmlEntities($html);

      $this->document->loadHTML($html, $optionsXml);

      // remove the "xml-encoding" hack
      if ($xmlHackUsed === true) {
        foreach ($this->document->childNodes as $child) {
          if ($child->nodeType === XML_PI_NODE) {
            // the injected hint is the first processing instruction; stop right
            // after removing it instead of iterating a list that was just mutated
            $this->document->removeChild($child);

            break;
          }
        }
      }

      \libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    \libxml_use_internal_errors($internalErrors);
    \libxml_disable_entity_loader($disableEntityLoader);

    return $this->document;
  }
413 111
414
  /**
   * Return the first element matching "#<id>".
   *
   * @param string $id
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementById(string $id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
425 2
426
  /**
   * Return the first element with the given tag name.
   *
   * @param string $name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank a blank node if no such element exists
   */
  public function getElementByTagName(string $name)
  {
    $firstMatch = $this->document->getElementsByTagName($name)->item(0);

    return $firstMatch === null
        ? new SimpleHtmlDomNodeBlank()
        : new SimpleHtmlDom($firstMatch);
  }
443 1
444
  /**
   * Return the elements matching "#<id>", or only the one at position $idx.
   *
   * @param string   $id
   * @param null|int $idx
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function getElementsById(string $id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
456
457
  /**
   * Return the elements with the given tag name, or only the one at position $idx.
   *
   * A negative $idx counts from the end of the result list. If there is no
   * element at the requested position, a blank node is returned.
   *
   * NOTE: the code-quality report for this file flags this method as
   * duplicated elsewhere in the project.
   *
   * @param string   $name
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementsByTagName(string $name, $idx = null)
  {
    $collection = new SimpleHtmlDomNode();
    foreach ($this->document->getElementsByTagName($name) as $domNode) {
      $collection[] = new SimpleHtmlDom($domNode);
    }

    // no position requested: hand back the whole list
    if ($idx === null) {
      return $collection;
    }

    // negative positions are relative to the end of the list
    $position = ($idx < 0) ? \count($collection) + $idx : $idx;

    return isset($collection[$position])
        ? $collection[$position]
        : new SimpleHtmlDomNodeBlank();
  }
493
494
  /**
   * Find nodes with a CSS selector.
   *
   * The selector is converted to XPath and evaluated against the document.
   * Without $idx all matches are returned. With $idx only the match at that
   * position is returned (negative values count from the end), or a blank
   * node if there is no match at that position.
   *
   * @param string   $selector
   * @param int|null $idx
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function find(string $selector, $idx = null)
  {
    /** @noinspection ExceptionsAnnotatingAndHandlingInspection */
    $expression = SelectorConverter::toXPath($selector);
    $matches = (new DOMXPath($this->document))->query($expression);

    $collection = new SimpleHtmlDomNode();
    foreach ($matches as $domNode) {
      $collection[] = new SimpleHtmlDom($domNode);
    }

    // no position requested: hand back the whole list
    if ($idx === null) {
      return $collection;
    }

    // negative positions are relative to the end of the list
    $position = ($idx < 0) ? \count($collection) + $idx : $idx;

    return isset($collection[$position])
        ? $collection[$position]
        : new SimpleHtmlDomNodeBlank();
  }
533 5
534
  /**
   * Post-process serialized output.
   *
   * Depending on the flags recorded while loading, line breaks and
   * <html> / <body> / <p> wrappers plus the default doctype are stripped
   * again. Afterwards the internal helper tags are removed, doubled <head>
   * tags are collapsed, the result is trimmed, entity- and url-decoded and
   * the placeholders are turned back into the original characters.
   *
   * @param string $content
   * @param bool   $multiDecodeNewHtmlEntity decode repeatedly instead of once
   *
   * @return string
   */
  protected function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
  {
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      $wrapperNoise = ["\n", "\r\n", "\r", '<body>', '</body>', '<html>', '</html>'];
      $content = str_replace($wrapperNoise, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutWrapper === true) {
      $content = preg_replace('/^<p>/', '', $content);
      $content = preg_replace('/<\/p>/', '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $plainTextNoise = [
          '<p>',
          '</p>',
          '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
      ];
      $content = str_replace($plainTextNoise, '', $content);
    }

    // drop the helper tags and collapse doubled head-tags
    $content = str_replace(
        ['<simpleHtmlDomP>', '</simpleHtmlDomP>', '<head><head>', '</head></head>'],
        ['', '', '<head>', '</head>'],
        $content
    );

    $content = trim($content);

    // one round of entity- and url-decoding
    $decodeOnce = static function (string $value): string {
      return \rawurldecode(\html_entity_decode($value, ENT_QUOTES | ENT_HTML5));
    };

    if ($multiDecodeNewHtmlEntity !== true) {
      $content = $decodeOnce($content);
    } elseif (class_exists('\voku\helper\UTF8')) {
      /** @noinspection PhpUndefinedClassInspection */
      $content = \voku\helper\UTF8::rawurldecode($content);
    } else {
      // decode until nothing changes anymore
      do {
        $previous = $content;
        $content = $decodeOnce($content);
      } while ($previous !== $content);
    }

    return self::putReplacedBackToPreserveHtmlEntities($content);
  }
631 8
632
  /**
   * Access the wrapped DOM document.
   *
   * @return DOMDocument
   */
  public function getDocument(): \DOMDocument
  {
    $document = $this->document;

    return $document;
  }
639 36
640
  /**
   * The encoding that is used for creating and loading the document.
   *
   * @return string
   */
  private function getEncoding(): string
  {
    $encoding = $this->encoding;

    return $encoding;
  }
649 6
650
  /**
   * Whether the "created without html" flag has been set while loading.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml(): bool
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtml;

    return $flag;
  }
657
658
  /**
   * Whether the "created without html wrapper" flag has been set while loading.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $flag;
  }
665
666
  /**
   * Whether the "created without head wrapper" flag has been set while loading.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
  {
    $flag = $this->isDOMDocumentCreatedWithoutHeadWrapper;

    return $flag;
  }
673 36
674 17
  /**
   * Whether the "created without wrapper" flag has been set while loading.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutWrapper(): bool
  {
    $flag = $this->isDOMDocumentCreatedWithoutWrapper;

    return $flag;
  }
681
682
  /**
   * Get dom node's outer html.
   *
   * A registered callback (see set_callback()) is invoked first, with an
   * array holding this parser as its only argument. If the input had no
   * "<html" wrapper, only the document element is serialized, otherwise the
   * whole document.
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function html(bool $multiDecodeNewHtmlEntity = false): string
  {
    if (static::$callback !== null) {
      \call_user_func(static::$callback, [$this]);
    }

    $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
  }
703
704
  /**
   * Get the HTML as XML.
   *
   * The document is serialized with LIBXML_NOEMPTYTAG, the XML header is
   * stripped and the result is post-processed by fixHtmlOutput().
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function xml(bool $multiDecodeNewHtmlEntity = false): string
  {
    $serialized = $this->document->saveXML(null, LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $withoutHeader = preg_replace('/<\?xml.*\?>/', '', $serialized);

    return $this->fixHtmlOutput(ltrim($withoutHeader), $multiDecodeNewHtmlEntity);
  }
720
721
  /**
   * Get dom node's inner html.
   *
   * Serializes every child of the document element and post-processes the
   * result with fixHtmlOutput(). A document without a document element
   * (nothing loaded yet) yields the post-processed empty string.
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
  {
    $text = '';

    // documentElement is null for an empty document: skip the loop instead of
    // dereferencing null (which raised notices / warnings before)
    $root = $this->document->documentElement;
    if ($root !== null) {
      foreach ($root->childNodes as $node) {
        $text .= $this->document->saveHTML($node);
      }
    }

    return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
  }
738
739
  /**
   * Load HTML from string.
   *
   * The document of this parser is replaced by the one that is created from
   * the given HTML. A non-string $html is rejected by the "string" type
   * declaration.
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions
   *
   * @return HtmlDomParser this instance, for chaining
   */
  public function loadHtml(string $html, $libXMLExtraOptions = null): HtmlDomParser
  {
    $document = $this->createDOMDocument($html, $libXMLExtraOptions);
    $this->document = $document;

    return $this;
  }
755 10
756 1
  /**
   * Load HTML from file.
   *
   * Paths that do not start with "http://" or "https://" must exist. The
   * content is read via \voku\helper\UTF8::file_get_contents() if that class
   * is available, otherwise via file_get_contents(), and is then passed to
   * loadHtml().
   *
   * @param string   $filePath
   * @param int|null $libXMLExtraOptions
   *
   * @return HtmlDomParser this instance, for chaining
   *
   * @throws \RuntimeException if the file does not exist or can not be read
   * @throws \InvalidArgumentException
   */
  public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): HtmlDomParser
  {
    // the existence check is skipped for http(s) URLs
    if (!preg_match("/^https?:\/\//i", $filePath) && !file_exists($filePath)) {
      throw new RuntimeException("File $filePath not found");
    }

    try {
      if (class_exists('\voku\helper\UTF8')) {
        /** @noinspection PhpUndefinedClassInspection */
        $html = \voku\helper\UTF8::file_get_contents($filePath);
      } else {
        $html = file_get_contents($filePath);
      }
    } catch (\Exception $e) {
      // chain the original exception, so that the root cause is not lost
      throw new RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html, $libXMLExtraOptions);

    return $this;
  }
792
793
  /**
   * Get the inner html and optionally write it to a file as well.
   *
   * @param string $filepath target file; nothing is written for an empty string
   *
   * @return string the inner html
   */
  public function save(string $filepath = ''): string
  {
    $innerHtml = $this->innerHtml();

    if ($filepath === '') {
      return $innerHtml;
    }

    // exclusive lock while writing
    file_put_contents($filepath, $innerHtml, LOCK_EX);

    return $innerHtml;
  }
809
810
  /**
   * Register the callback that html() invokes before serializing.
   *
   * The callback is stored in a static property and is therefore not bound
   * to this single instance.
   *
   * @param callable|null $functionName
   */
  public function set_callback($functionName)
  {
    static::$callback = $functionName;
  }
817
818
  /**
   * Get dom node's plain text.
   *
   * This is the text content of the document, post-processed by fixHtmlOutput().
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function text(bool $multiDecodeNewHtmlEntity = false): string
  {
    $plainText = $this->document->textContent;

    return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
  }
829
}
830