Completed
Push — master ( b8fd30...907d46 )
by Lars
01:39
created

HtmlDomParser::__toString()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 4
ccs 2
cts 2
cp 1
rs 10
cc 1
eloc 2
nc 1
nop 0
crap 1
1
<?php
2
3
declare(strict_types=1);
4
5
namespace voku\helper;
6
7
use BadMethodCallException;
8
use DOMDocument;
9
use DOMXPath;
10
use InvalidArgumentException;
11
use RuntimeException;
12
13
/**
14
 * Class HtmlDomParser
15
 *
16
 * @package voku\helper
17
 *
18
 * @property-read string outerText <p>Get dom node's outer html (alias for "outerHtml").</p>
19
 * @property-read string outerHtml <p>Get dom node's outer html.</p>
20
 * @property-read string innerText <p>Get dom node's inner html (alias for "innerHtml").</p>
21
 * @property-read string innerHtml <p>Get dom node's inner html.</p>
22
 * @property-read string plaintext <p>Get dom node's plain text.</p>
23
 *
24
 * @method string outerText() <p>Get dom node's outer html (alias for "outerHtml()").</p>
25
 * @method string outerHtml() <p>Get dom node's outer html.</p>
26
 * @method string innerText() <p>Get dom node's inner html (alias for "innerHtml()").</p>
27
 *
28
 * @method HtmlDomParser load() load($html) <p>Load HTML from string.</p>
29
 * @method HtmlDomParser load_file() load_file($html) <p>Load HTML from file.</p>
30
 *
31
 * @method static HtmlDomParser file_get_html() file_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from
32
 *         file.</p>
33
 * @method static HtmlDomParser str_get_html() str_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from
34
 *         string.</p>
35
 */
36
class HtmlDomParser
37
{
38
  /**
   * Lower-cased legacy method names accepted by __call(), mapped to the
   * real method of this class that handles them.
   *
   * @var string[]
   */
  protected static $functionAliases = [
      'outertext' => 'html',
      'outerhtml' => 'html',
      'innertext' => 'innerHtml',
      'innerhtml' => 'innerHtml',
      'load'      => 'loadHtml',
      'load_file' => 'loadHtmlFile',
  ];

  /**
   * Characters that are masked inside detected links ('orig') and the
   * placeholder written in their place ('tmp'). Both lists are index-aligned.
   *
   * @var string[][]
   */
  protected static $domLinkReplaceHelper = [
      'orig' => ['[', ']', '{', '}'],
      'tmp'  => [
          '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT____',
          '____SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT____',
          '____SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT____',
          '____SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT____',
      ],
  ];

  /**
   * Characters that are masked in the whole input ('orig') and the
   * placeholder written in their place ('tmp'). Both lists are index-aligned.
   *
   * @var string[][]
   */
  protected static $domReplaceHelper = [
      'orig' => ['&', '|', '+', '%', '@'],
      'tmp'  => [
          '____SIMPLE_HTML_DOM__VOKU__AMP____',
          '____SIMPLE_HTML_DOM__VOKU__PIPE____',
          '____SIMPLE_HTML_DOM__VOKU__PLUS____',
          '____SIMPLE_HTML_DOM__VOKU__PERCENT____',
          '____SIMPLE_HTML_DOM__VOKU__AT____',
      ],
  ];

  /**
   * Optional callback invoked by html() before the document is serialized.
   * It is static, so it is shared by every parser instance.
   *
   * @var callable|null
   */
  protected static $callback;

  /**
   * The wrapped DOM document.
   *
   * @var DOMDocument
   */
  protected $document;

  /**
   * Encoding used when creating and loading the document.
   *
   * @var string
   */
  protected $encoding = 'UTF-8';

  /**
   * Set when the loaded input contained no "<" at all.
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtml = false;

  /**
   * Set when the loaded input contained markup but did not start with "<".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutWrapper = false;

  /**
   * Set when the loaded input contained no "<head>".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHeadWrapper = false;

  /**
   * Set when the loaded input contained no "<html".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
111
112
  /**
   * Create a parser, optionally pre-filled from HTML or an existing node.
   *
   * A SimpleHtmlDom is unwrapped to its DOM node first. A DOM node is
   * deep-imported into a fresh document; any other non-null value is passed
   * to loadHtml().
   *
   * @param string|SimpleHtmlDom|\DOMNode|null $element HTML code or SimpleHtmlDom, \DOMNode
   *
   * @throws \InvalidArgumentException
   */
  public function __construct($element = null)
  {
    $domDocument = new \DOMDocument('1.0', $this->getEncoding());

    // DOMDocument settings
    $domDocument->preserveWhiteSpace = true;
    $domDocument->formatOutput = true;
    $this->document = $domDocument;

    $source = $element instanceof SimpleHtmlDom ? $element->getNode() : $element;

    if ($source instanceof \DOMNode) {
      $imported = $this->document->importNode($source, true);
      if ($imported instanceof \DOMNode) {
        $this->document->appendChild($imported);
      }

      return;
    }

    if ($source === null) {
      return;
    }

    $this->loadHtml($source);
  }
145
146
  /**
   * Forward legacy method names (see $functionAliases) to the real method.
   *
   * The lookup is case-insensitive.
   *
   * @param string $name
   * @param array  $arguments
   *
   * @return mixed result of the aliased method
   *
   * @throws \BadMethodCallException if no alias exists for the given name
   */
  public function __call($name, $arguments)
  {
    $alias = \strtolower($name);

    if (!isset(self::$functionAliases[$alias])) {
      /** @noinspection ExceptionsAnnotatingAndHandlingInspection */
      throw new BadMethodCallException('Method does not exist: ' . $alias);
    }

    $target = [$this, self::$functionAliases[$alias]];

    return \call_user_func_array($target, $arguments);
  }
163
164
  /**
   * Static factories "str_get_html" and "file_get_html".
   *
   * Each one creates a new parser and loads the first argument (HTML string
   * or file path) with the optional second argument as extra libxml options.
   * The name is matched case-insensitively, like PHP method names and like
   * __call() in this class.
   *
   * @param string $name
   * @param array  $arguments
   *
   * @return HtmlDomParser
   *
   * @throws \BadMethodCallException if the static method is unknown
   * @throws \RuntimeException
   * @throws \InvalidArgumentException
   */
  public static function __callStatic($name, $arguments)
  {
    $source = $arguments[0] ?? '';
    $libXMLExtraOptions = $arguments[1] ?? null;

    switch (\strtolower($name)) {
      case 'str_get_html':
        /** @noinspection ExceptionsAnnotatingAndHandlingInspection */
        $parser = new self();

        return $parser->loadHtml($source, $libXMLExtraOptions);

      case 'file_get_html':
        /** @noinspection ExceptionsAnnotatingAndHandlingInspection */
        $parser = new self();

        return $parser->loadHtmlFile($source, $libXMLExtraOptions);
    }

    // include the name, as __call() does, so the failure can be traced
    throw new BadMethodCallException('Method does not exist: ' . $name);
  }
202
203
  /** @noinspection MagicMethodsValidityInspection */
  /**
   * Magic read access for the legacy properties, matched case-insensitively:
   * "outerhtml"/"outertext", "innerhtml"/"innertext" and "text"/"plaintext".
   *
   * @param string $name
   *
   * @return string|null null for any other property name
   */
  public function __get($name)
  {
    $property = \strtolower($name);

    if ($property === 'outerhtml' || $property === 'outertext') {
      return $this->html();
    }

    if ($property === 'innerhtml' || $property === 'innertext') {
      return $this->innerHtml();
    }

    if ($property === 'text' || $property === 'plaintext') {
      return $this->text();
    }

    return null;
  }
227
228
  /**
   * Calling the parser like a function is a shortcut for find().
   *
   * @param string   $selector
   * @param int|null $idx
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function __invoke($selector, $idx = null)
  {
    $result = $this->find($selector, $idx);

    return $result;
  }
238
239
  /**
   * String conversion yields the same value as html().
   *
   * @return string
   */
  public function __toString()
  {
    $outerHtml = $this->html();

    return $outerHtml;
  }
246
247
  /**
   * No-op kept only for API compatibility; always reports success.
   *
   * @deprecated
   *
   * @return bool always true
   */
  public function clear(): bool
  {
    $cleared = true;

    return $cleared;
  }
258
259
  /**
   * Mask characters with placeholders (see $domReplaceHelper) before parsing.
   *
   * If the input contains "http", every detected link is first replaced by a
   * copy in which "[", "]", "{" and "}" are masked as well
   * (see $domLinkReplaceHelper).
   *
   * @param string $html
   *
   * @return string the masked HTML
   */
  public static function replaceToPreserveHtmlEntities(string $html): string
  {
    // default: only the generic character masking
    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    if (\strpos($html, 'http') !== false) {

      // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
      $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
      $matches = [];
      \preg_match_all($regExUrl, $html, $matches);

      if (!empty($matches[1])) {
        $links = (array)$matches[1];

        // an array subject makes str_replace() mask each link individually
        $maskedLinks = \str_replace(
            self::$domLinkReplaceHelper['orig'],
            self::$domLinkReplaceHelper['tmp'],
            $links
        );

        // links go first, so they are swapped before their characters are masked
        $search = \array_merge($links, $search);
        $replace = \array_merge($maskedLinks, $replace);
      }
    }

    return \str_replace($search, $replace, $html);
  }
299
300
  /**
   * Reverse replaceToPreserveHtmlEntities(): turn every placeholder back
   * into its original character.
   *
   * The merged placeholder/original lists are built on the first call and
   * reused afterwards.
   *
   * @param string $html
   *
   * @return string
   */
  public static function putReplacedBackToPreserveHtmlEntities(string $html): string
  {
    static $placeholders = null;
    static $originals = null;

    if ($placeholders === null) {
      $placeholders = \array_merge(
          self::$domLinkReplaceHelper['tmp'],
          self::$domReplaceHelper['tmp']
      );
      $originals = \array_merge(
          self::$domLinkReplaceHelper['orig'],
          self::$domReplaceHelper['orig']
      );
    }

    return \str_replace($placeholders, $originals, $html);
  }
322
323
  /**
   * Create DOMDocument from HTML.
   *
   * The input is first tried as XML via SimpleXML; if that fails or reports
   * libxml errors, it is loaded with DOMDocument::loadHTML() after masking
   * special characters (see replaceToPreserveHtmlEntities()).
   *
   * The "isDOMDocumentCreatedWithout*" flags are recomputed on every call, so
   * a re-used parser does not keep flags from a previously loaded input.
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions additional LIBXML_* flags
   *
   * @return \DOMDocument
   */
  private function createDOMDocument(string $html, $libXMLExtraOptions = null): \DOMDocument
  {
    // reset the flags: they describe the current input only
    $this->isDOMDocumentCreatedWithoutHtml = false;
    $this->isDOMDocumentCreatedWithoutWrapper = false;
    $this->isDOMDocumentCreatedWithoutHtmlWrapper = false;
    $this->isDOMDocumentCreatedWithoutHeadWrapper = false;

    if (\strpos($html, '<') === false) {
      $this->isDOMDocumentCreatedWithoutHtml = true;
    } elseif (\strpos(\ltrim($html), '<') !== 0) {
      $this->isDOMDocumentCreatedWithoutWrapper = true;
    }

    if (\strpos($html, '<html') === false) {
      $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
    }

    if (\strpos($html, '<head>') === false) {
      $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
    }

    // set error level
    $internalErrors = \libxml_use_internal_errors(true);

    // libxml_disable_entity_loader() is deprecated as of PHP 8.0,
    // so it is only toggled on older versions
    $toggleEntityLoader = \PHP_VERSION_ID < 80000;
    $disableEntityLoader = null;
    if ($toggleEntityLoader) {
      $disableEntityLoader = \libxml_disable_entity_loader(true);
    }

    \libxml_clear_errors();

    try {
      $optionsXml = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;

      if (\defined('LIBXML_BIGLINES')) {
        $optionsXml |= LIBXML_BIGLINES;
      }

      if (\defined('LIBXML_COMPACT')) {
        $optionsXml |= LIBXML_COMPACT;
      }

      if (\defined('LIBXML_HTML_NODEFDTD')) {
        $optionsXml |= LIBXML_HTML_NODEFDTD;
      }

      if ($libXMLExtraOptions !== null) {
        $optionsXml |= $libXMLExtraOptions;
      }

      $sxe = \simplexml_load_string($html, 'SimpleXMLElement', $optionsXml);
      if ($sxe !== false && \count(\libxml_get_errors()) === 0) {
        $this->document = \dom_import_simplexml($sxe)->ownerDocument;
      } else {

        // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
        $html = \trim($html);
        $xmlHackUsed = false;
        // NOTE(review): haystack and needle look swapped here, so the hack is
        // applied to practically every input. Kept as-is on purpose: a plain
        // swap would skip the hack for inputs whose own "<?xml" declaration
        // carries no encoding - confirm the intended condition before changing.
        if (\stripos('<?xml', $html) !== 0) {
          $xmlHackUsed = true;
          $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
        }

        $html = self::replaceToPreserveHtmlEntities($html);

        $this->document->loadHTML($html, $optionsXml);

        // remove the "xml-encoding" hack: drop the first top-level processing
        // instruction and stop, so the live child list is never iterated
        // after it has been modified
        if ($xmlHackUsed === true) {
          foreach ($this->document->childNodes as $child) {
            if ($child->nodeType === XML_PI_NODE) {
              $this->document->removeChild($child);

              break;
            }
          }
        }

        \libxml_clear_errors();
      }

      // set encoding
      $this->document->encoding = $this->getEncoding();
    } finally {
      // restore lib-xml settings, even if loading failed with an exception
      \libxml_use_internal_errors($internalErrors);
      if ($toggleEntityLoader) {
        \libxml_disable_entity_loader($disableEntityLoader);
      }
    }

    return $this->document;
  }
408
409
  /**
   * Return the first element matching the selector "#<id>".
   *
   * @param string $id
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementById(string $id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
420
421
  /**
   * Return the first element with the given tag name.
   *
   * @param string $name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank blank element if nothing matches
   */
  public function getElementByTagName(string $name)
  {
    $firstMatch = $this->document->getElementsByTagName($name)->item(0);

    if ($firstMatch !== null) {
      return new SimpleHtmlDom($firstMatch);
    }

    return new SimpleHtmlDomNodeBlank();
  }
438
439
  /**
   * Return the elements matching the selector "#<id>", or one of them when
   * an index is given (see find()).
   *
   * @param string   $id
   * @param null|int $idx
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function getElementsById(string $id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
451
452
  /**
   * Return the elements with the given tag name.
   *
   * Without an index the whole collection is returned. With an index the
   * single element at that position is returned; a negative index counts
   * from the end, and a position that does not exist yields a blank element.
   *
   * @param string   $name
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementsByTagName(string $name, $idx = null)
  {
    $domElements = $this->document->getElementsByTagName($name);

    $collection = new SimpleHtmlDomNode();
    foreach ($domElements as $domElement) {
      $collection[] = new SimpleHtmlDom($domElement);
    }

    if ($idx === null) {
      return $collection;
    }

    // a negative index is relative to the end of the collection
    $position = $idx < 0 ? \count($collection) + $idx : $idx;

    return isset($collection[$position]) ? $collection[$position] : new SimpleHtmlDomNodeBlank();
  }
488
489
  /**
   * Return the first node matching a CSS selector; shortcut for find($selector, 0).
   *
   * @param string $selector
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function findOne(string $selector)
  {
    $firstMatch = $this->find($selector, 0);

    return $firstMatch;
  }
500
501
  /**
   * Find list of nodes with a CSS selector.
   *
   * The selector is converted to XPath and evaluated against the document.
   * Without an index the whole collection is returned. With an index the
   * single element at that position is returned; a negative index counts
   * from the end, and a position that does not exist yields a blank element.
   *
   * @param string   $selector
   * @param int|null $idx
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function find(string $selector, $idx = null)
  {
    /** @noinspection ExceptionsAnnotatingAndHandlingInspection */
    $xPathQuery = SelectorConverter::toXPath($selector);

    $xPath = new DOMXPath($this->document);
    $nodesList = $xPath->query($xPathQuery);
    $elements = new SimpleHtmlDomNode();

    // DOMXPath::query() returns false for an expression it cannot evaluate;
    // treat that as "no matches" instead of iterating over false
    if ($nodesList !== false) {
      foreach ($nodesList as $node) {
        $elements[] = new SimpleHtmlDom($node);
      }
    }

    // return all elements
    if (null === $idx) {
      return $elements;
    }

    // handle negative values
    if ($idx < 0) {
      $idx = \count($elements) + $idx;
    }

    // return one element
    if (isset($elements[$idx])) {
      return $elements[$idx];
    }

    // return a blank-element
    return new SimpleHtmlDomNodeBlank();
  }
540
541
  /**
   * Clean up serialized output and undo the placeholder masking.
   *
   * Depending on the flags recorded while loading, wrapper markup is
   * stripped. Helper tags are then normalized, the result is trimmed and
   * entity/url-decoded, and finally the placeholders are replaced by their
   * original characters.
   *
   * @param string $content
   * @param bool   $multiDecodeNewHtmlEntity decode repeatedly instead of once
   *
   * @return string
   */
  public function fixHtmlOutput(string $content, bool $multiDecodeNewHtmlEntity = false): string
  {
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      $wrapperParts = ["\n", "\r\n", "\r", '<body>', '</body>', '<html>', '</html>'];
      $content = \str_replace($wrapperParts, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutWrapper === true) {
      $content = (string)\preg_replace('/^<p>/', '', $content);
      $content = (string)\preg_replace('/<\/p>/', '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $plainTextParts = [
          '<p>',
          '</p>',
          '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
      ];
      $content = \str_replace($plainTextParts, '', $content);
    }

    $helperSearch = ['<simpleHtmlDomP>', '</simpleHtmlDomP>', '<head><head>', '</head></head>', '<br></br>'];
    $helperReplace = ['', '', '<head>', '</head>', '<br>'];
    $content = \trim(\str_replace($helperSearch, $helperReplace, $content));

    // one decoding pass: html entities first, then url-encoding
    $decodeOnce = static function (string $value): string {
      return \rawurldecode(\html_entity_decode($value, ENT_QUOTES | ENT_HTML5));
    };

    if ($multiDecodeNewHtmlEntity !== true) {
      $content = $decodeOnce($content);
    } elseif (\class_exists('\voku\helper\UTF8')) {
      /** @noinspection PhpUndefinedClassInspection */
      $content = \voku\helper\UTF8::rawurldecode($content);
    } else {
      // repeat until a pass no longer changes the content
      do {
        $previous = $content;
        $content = $decodeOnce($content);
      } while ($previous !== $content);
    }

    return self::putReplacedBackToPreserveHtmlEntities($content);
  }
640
641
  /**
   * Get the wrapped DOM document.
   *
   * @return DOMDocument
   */
  public function getDocument(): \DOMDocument
  {
    $domDocument = $this->document;

    return $domDocument;
  }
648
649
  /**
   * Get the encoding configured for this parser.
   *
   * @return string
   */
  private function getEncoding(): string
  {
    $encoding = $this->encoding;

    return $encoding;
  }
658
659
  /**
   * Whether the loaded input contained no "<" at all.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml(): bool
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtml;

    return $flag;
  }
666
667
  /**
   * Whether the loaded input contained no "<html".
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper(): bool
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $flag;
  }
674
675
  /**
   * Whether the loaded input contained no "<head>".
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHeadWrapper(): bool
  {
    $flag = $this->isDOMDocumentCreatedWithoutHeadWrapper;

    return $flag;
  }
682
683
  /**
   * Whether the loaded input contained markup but did not start with "<".
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutWrapper(): bool
  {
    $flag = $this->isDOMDocumentCreatedWithoutWrapper;

    return $flag;
  }
690
691
  /**
   * Get dom node's outer html.
   *
   * A callback registered via set_callback() is invoked first, with an
   * array containing this parser as its only argument. If the loaded input
   * had no "<html" wrapper, only the document element is serialized.
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function html(bool $multiDecodeNewHtmlEntity = false): string
  {
    $callback = static::$callback;
    if ($callback !== null) {
      \call_user_func($callback, [$this]);
    }

    $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
  }
712
713
  /**
   * Get the HTML as XML.
   *
   * The document is serialized with LIBXML_NOEMPTYTAG, the XML header is
   * removed and the result is passed through fixHtmlOutput().
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function xml(bool $multiDecodeNewHtmlEntity = false): string
  {
    $serialized = $this->document->saveXML(null, LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $withoutHeader = (string)\preg_replace('/<\?xml.*\?>/', '', $serialized);

    return $this->fixHtmlOutput(\ltrim($withoutHeader), $multiDecodeNewHtmlEntity);
  }
729
730
  /**
   * Get dom node's inner html.
   *
   * Serializes every child of the document element and passes the result
   * through fixHtmlOutput(). A document without a document element (e.g. a
   * parser that has not loaded anything yet) yields an empty inner html
   * instead of dereferencing null.
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function innerHtml(bool $multiDecodeNewHtmlEntity = false): string
  {
    // init
    $text = '';

    $root = $this->document->documentElement;
    if ($root !== null) {
      foreach ($root->childNodes as $node) {
        $text .= $this->document->saveHTML($node);
      }
    }

    return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
  }
748
749
  /**
   * Load HTML from string, replacing the current document.
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions additional LIBXML_* flags
   *
   * @return HtmlDomParser this instance
   */
  public function loadHtml(string $html, $libXMLExtraOptions = null): self
  {
    $domDocument = $this->createDOMDocument($html, $libXMLExtraOptions);
    $this->document = $domDocument;

    return $this;
  }
765
766
  /**
   * Load HTML from file.
   *
   * A path starting with "http://" or "https://" is read without the
   * existence check; any other path must exist. The content is read with
   * \voku\helper\UTF8::file_get_contents() when that class is available,
   * otherwise with file_get_contents().
   *
   * @param string   $filePath
   * @param int|null $libXMLExtraOptions additional LIBXML_* flags
   *
   * @return HtmlDomParser this instance
   *
   * @throws \RuntimeException if the file does not exist or cannot be read
   * @throws \InvalidArgumentException
   */
  public function loadHtmlFile(string $filePath, $libXMLExtraOptions = null): self
  {
    if (
        !\preg_match("/^https?:\/\//i", $filePath)
        &&
        !\file_exists($filePath)
    ) {
      throw new RuntimeException("File $filePath not found");
    }

    try {
      if (\class_exists('\voku\helper\UTF8')) {
        /** @noinspection PhpUndefinedClassInspection */
        $html = \voku\helper\UTF8::file_get_contents($filePath);
      } else {
        $html = \file_get_contents($filePath);
      }
    } catch (\Exception $e) {
      // keep the original failure attached, so the cause is not lost
      throw new RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html, $libXMLExtraOptions);

    return $this;
  }
806
807
  /**
   * Return the inner html and, if a path is given, also write it to that
   * file (with an exclusive lock).
   *
   * @param string $filepath target file; nothing is written for an empty string
   *
   * @return string the inner html
   */
  public function save(string $filepath = ''): string
  {
    $innerHtml = $this->innerHtml();

    if ($filepath === '') {
      return $innerHtml;
    }

    \file_put_contents($filepath, $innerHtml, LOCK_EX);

    return $innerHtml;
  }
823
824
  /**
   * Register the callback that html() invokes before serializing.
   *
   * The callback is stored in a static property, so it applies to every
   * parser instance.
   *
   * @param callable|null $functionName
   */
  public function set_callback($functionName)
  {
    static::$callback = $functionName;
  }
831
832
  /**
   * Get dom node's plain text: the document's text content passed through
   * fixHtmlOutput().
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function text(bool $multiDecodeNewHtmlEntity = false): string
  {
    $plainText = $this->document->textContent;

    return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
  }
843
844
  /**
   * Give the cloned parser its own copy of the DOM document, so the clone
   * and the original do not share one document object.
   */
  public function __clone()
  {
    $copy = clone $this->document;
    $this->document = $copy;
  }
848
}
849