Completed
Push — master ( 2a517c...a42c86 )
by Lars
04:47
created

HtmlDomParser::replaceToPreserveHtmlEntities()   B

Complexity

Conditions 6
Paths 6

Size

Total Lines 35
Code Lines 21

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 24
CRAP Score 6

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 35
ccs 24
cts 24
cp 1
rs 8.439
cc 6
eloc 21
nc 6
nop 1
crap 6
1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMDocument;
7
use DOMXPath;
8
use InvalidArgumentException;
9
use RuntimeException;
10
11
/**
12
 * Class HtmlDomParser
13
 *
14
 * @package voku\helper
15
 *
16
 * @property-read string outerText <p>Get dom node's outer html (alias for "outerHtml").</p>
17
 * @property-read string outerHtml <p>Get dom node's outer html.</p>
18
 * @property-read string innerText <p>Get dom node's inner html (alias for "innerHtml").</p>
19
 * @property-read string innerHtml <p>Get dom node's inner html.</p>
20
 * @property-read string plaintext <p>Get dom node's plain text.</p>
21
 *
22
 * @method string outerText() <p>Get dom node's outer html (alias for "outerHtml()").</p>
23
 * @method string outerHtml() <p>Get dom node's outer html.</p>
24
 * @method string innerText() <p>Get dom node's inner html (alias for "innerHtml()").</p>
25
 *
26
 * @method HtmlDomParser load() load($html) <p>Load HTML from string.</p>
27
 * @method HtmlDomParser load_file() load_file($html) <p>Load HTML from file.</p>
28
 *
29
 * @method static HtmlDomParser file_get_html() file_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from
30
 *         file.</p>
31
 * @method static HtmlDomParser str_get_html() str_get_html($html, $libXMLExtraOptions = null) <p>Load HTML from
32
 *         string.</p>
33
 */
34
class HtmlDomParser
35
{
36
  /**
37
   * @var array
38
   */
39
  protected static $functionAliases = array(
40
      'outertext' => 'html',
41
      'outerhtml' => 'html',
42
      'innertext' => 'innerHtml',
43
      'innerhtml' => 'innerHtml',
44
      'load'      => 'loadHtml',
45
      'load_file' => 'loadHtmlFile',
46
  );
47
48
  /**
49
   * @var string[][]
50
   */
51
  protected static $domLinkReplaceHelper = array(
52
      'orig' => array('[', ']', '{', '}',),
53
      'tmp'  => array(
54
          '!!!!SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_LEFT!!!!',
55
          '!!!!SIMPLE_HTML_DOM__VOKU__SQUARE_BRACKET_RIGHT!!!!',
56
          '!!!!SIMPLE_HTML_DOM__VOKU__BRACKET_LEFT!!!!',
57
          '!!!!SIMPLE_HTML_DOM__VOKU__BRACKET_RIGHT!!!!',
58
      ),
59
  );
60
61
  /**
62
   * @var array
63
   */
64
  protected static $domReplaceHelper = array(
65
      'orig' => array('&', '|', '+', '%'),
66
      'tmp'  => array(
67
          '!!!!SIMPLE_HTML_DOM__VOKU__AMP!!!!',
68
          '!!!!SIMPLE_HTML_DOM__VOKU__PIPE!!!!',
69
          '!!!!SIMPLE_HTML_DOM__VOKU__PLUS!!!!',
70
          '!!!!SIMPLE_HTML_DOM__VOKU__PERCENT!!!!',
71
      ),
72
  );
73
74
  /**
75
   * @var Callable
76
   */
77
  protected static $callback;
78
79
  /**
80
   * @var DOMDocument
81
   */
82
  protected $document;
83
84
  /**
85
   * @var string
86
   */
87
  protected $encoding = 'UTF-8';
88
89
  /**
90
   * @var bool
91
   */
92
  protected $isDOMDocumentCreatedWithoutHtml = false;
93
94
  /**
95
   * @var bool
96
   */
97
  protected $isDOMDocumentCreatedWithoutWrapper = false;
98
99
  /**
100
   * @var bool
101
   */
102
  protected $isDOMDocumentCreatedWithoutHeadWrapper = false;
103
104
  /**
105
   * @var bool
106
   */
107
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
108
109
  /**
   * Constructor
   *
   * <p>Creates an empty DOMDocument (version "1.0", parser encoding) and, when an element is
   * given, fills it: a SimpleHtmlDom is unwrapped via getNode(), a \DOMNode is deep-imported and
   * appended, and any other non-null value is passed to loadHtml().</p>
   *
   * @param string|SimpleHtmlDom|\DOMNode $element HTML code or SimpleHtmlDom, \DOMNode
   *
   * @throws \InvalidArgumentException
   */
  public function __construct($element = null)
  {
    $this->document = new \DOMDocument('1.0', $this->getEncoding());

    // DOMDocument settings
    $this->document->preserveWhiteSpace = true;
    $this->document->formatOutput = true;

    // a wrapper object is reduced to the DOM node it carries
    $source = $element instanceof SimpleHtmlDom ? $element->getNode() : $element;

    if ($source instanceof \DOMNode) {
      // nodes of a foreign document must be imported before they can be appended
      $imported = $this->document->importNode($source, true);

      if ($imported instanceof \DOMNode) {
        $this->document->appendChild($imported);
      }
    } elseif ($source !== null) {
      $this->loadHtml($source);
    }
  }
142
143
  /**
   * Dispatch calls to legacy method names (e.g. "outertext", "load_file") to the real methods.
   *
   * <p>The lookup is case-insensitive: the name is lower-cased and resolved through
   * self::$functionAliases.</p>
   *
   * @param string $name      called method name
   * @param array  $arguments arguments that are forwarded to the aliased method
   *
   * @return bool|mixed result of the aliased method
   *
   * @throws \BadMethodCallException if no alias exists for the given name
   */
  public function __call($name, $arguments)
  {
    $alias = strtolower($name);

    if (!isset(self::$functionAliases[$alias])) {
      /** @noinspection ExceptionsAnnotatingAndHandlingInspection */
      throw new BadMethodCallException('Method does not exist: ' . $alias);
    }

    $target = array($this, self::$functionAliases[$alias]);

    return call_user_func_array($target, $arguments);
  }
160
161
  /**
   * Static factory aliases: "str_get_html" (parse a string) and "file_get_html" (parse a file).
   *
   * <p>The first argument is the HTML / file path, the optional second argument are extra
   * libxml options; missing arguments are passed on as null.</p>
   *
   * @param string $name      called static method name (compared case-sensitively)
   * @param array  $arguments call arguments
   *
   * @return HtmlDomParser
   *
   * @throws \BadMethodCallException
   * @throws \RuntimeException
   * @throws \InvalidArgumentException
   */
  public static function __callStatic($name, $arguments)
  {
    $source = isset($arguments[0]) ? $arguments[0] : null;
    $libXMLExtraOptions = isset($arguments[1]) ? $arguments[1] : null;

    if ($name !== 'str_get_html' && $name !== 'file_get_html') {
      throw new BadMethodCallException('Method does not exist');
    }

    /** @noinspection ExceptionsAnnotatingAndHandlingInspection */
    $parser = new self();

    if ($name === 'str_get_html') {
      return $parser->loadHtml($source, $libXMLExtraOptions);
    }

    return $parser->loadHtmlFile($source, $libXMLExtraOptions);
  }
199
200
  /** @noinspection MagicMethodsValidityInspection */
  /**
   * Read-only magic properties (case-insensitive):
   * "outerHtml" / "outerText", "innerHtml" / "innerText" and "text" / "plaintext".
   *
   * @param string $name
   *
   * @return string|null the rendered content, or null for an unknown property name
   */
  public function __get($name)
  {
    $property = strtolower($name);

    if (in_array($property, array('outerhtml', 'outertext'), true)) {
      return $this->html();
    }

    if (in_array($property, array('innerhtml', 'innertext'), true)) {
      return $this->innerHtml();
    }

    if (in_array($property, array('text', 'plaintext'), true)) {
      return $this->text();
    }

    return null;
  }
224
225
  /**
   * Calling the parser like a function is a shortcut for find().
   *
   * @param string $selector CSS selector
   * @param int    $idx      optional index of a single element, null for all matches
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function __invoke($selector, $idx = null)
  {
    $matches = $this->find($selector, $idx);

    return $matches;
  }
235
236
  /**
   * String conversion yields the same output as html().
   *
   * @return string
   */
  public function __toString()
  {
    $markup = $this->html();

    return $markup;
  }
243
244
  /**
   * No-op that only exists for api-compatibility-reasons.
   *
   * @deprecated
   *
   * @return bool always true
   */
  public function clear()
  {
    // nothing to release here
    return true;
  }
255
256
  /**
   * Replace characters that would be altered by the DOM round-trip with placeholder tokens.
   *
   * <p>Always replaces the characters from self::$domReplaceHelper ('&', '|', '+', '%').
   * If the input contains "http", every detected http(s) link is additionally rewritten so that
   * the characters from self::$domLinkReplaceHelper ('[', ']', '{', '}') inside the link become
   * placeholders as well. putReplacedBackToPreserveHtmlEntities() reverses both steps.</p>
   *
   * @param string $html
   *
   * @return string
   */
  public static function replaceToPreserveHtmlEntities($html)
  {
    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    // cheap pre-check: without "http" there is no link to protect
    if (strpos($html, 'http') === false) {
      return str_replace($search, $replace, $html);
    }

    // regEx for e.g.: [https://www.domain.de/foo.php?foobar=1&email=lars%40moelleken.org&guid=test1233312&{{foo}}#foo]
    $regExUrl = '/(\[?\bhttps?:\/\/[^\s<>]+(?:\([\w]+\)|[^[:punct:]\s]|\/|\}|\]))/i';
    $matches = array();
    preg_match_all($regExUrl, $html, $matches);

    if (!empty($matches[1])) {
      $links = $matches[1];

      // str_replace() on an array subject rewrites every link and keeps the keys
      $protectedLinks = str_replace(
          self::$domLinkReplaceHelper['orig'],
          self::$domLinkReplaceHelper['tmp'],
          $links
      );

      // links go first, so whole links are swapped before the single characters are replaced
      $search = array_merge($links, $search);
      $replace = array_merge($protectedLinks, $replace);
    }

    return str_replace($search, $replace, $html);
  }
296
297
  /**
   * Reverse replaceToPreserveHtmlEntities(): turn all placeholder tokens back into the
   * original characters.
   *
   * @param string $html
   *
   * @return string
   */
  public static function putReplacedBackToPreserveHtmlEntities($html)
  {
    // the merged lookup lists never change, so they are built once per request
    static $placeholderMap = null;

    if ($placeholderMap === null) {
      $placeholderMap = array(
          'tmp'  => array_merge(
              self::$domLinkReplaceHelper['tmp'],
              self::$domReplaceHelper['tmp']
          ),
          'orig' => array_merge(
              self::$domLinkReplaceHelper['orig'],
              self::$domReplaceHelper['orig']
          ),
      );
    }

    return str_replace($placeholderMap['tmp'], $placeholderMap['orig'], $html);
  }
319
320
  /**
   * Create DOMDocument from HTML.
   *
   * <p>Records which wrappers (any tag, leading tag, "<html", "<head>") the input is missing,
   * then tries a strict parse via simplexml_load_string(). If that fails or libxml reports any
   * error, the input is parsed with DOMDocument::loadHTML() instead, after the entity
   * placeholders and an xml-encoding prefix were applied. The libxml error / entity-loader
   * settings are restored before returning.</p>
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions extra libxml option flags, added to the default flags
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html, $libXMLExtraOptions = null)
  {
    // remember what the input was lacking (these flags are only ever switched on here)
    if (strpos($html, '<') === false) {
      $this->isDOMDocumentCreatedWithoutHtml = true;
    } elseif (strpos(ltrim($html), '<') !== 0) {
      $this->isDOMDocumentCreatedWithoutWrapper = true;
    }

    if (strpos($html, '<html') === false) {
      $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
    }

    if (strpos($html, '<head>') === false) {
      $this->isDOMDocumentCreatedWithoutHeadWrapper = true;
    }

    // set error level
    $previousInternalErrors = libxml_use_internal_errors(true);
    $previousEntityLoader = libxml_disable_entity_loader(true);
    libxml_clear_errors();

    $optionsSimpleXml = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;
    $optionsXml = 0;

    // these flags do not exist in every libxml / PHP version, so they are added only if defined
    $optionalFlags = array(
        'LIBXML_BIGLINES',
        'LIBXML_COMPACT',
        'LIBXML_HTML_NOIMPLIED',
        'LIBXML_HTML_NODEFDTD',
    );
    foreach ($optionalFlags as $flagName) {
      if (defined($flagName)) {
        $optionsSimpleXml |= constant($flagName);
      }
    }

    if ($libXMLExtraOptions !== null) {
      $optionsSimpleXml |= $libXMLExtraOptions;
      $optionsXml |= $libXMLExtraOptions;
    }

    $sxe = simplexml_load_string($html, 'SimpleXMLElement', $optionsSimpleXml);
    $parsedStrictly = ($sxe !== false && count(libxml_get_errors()) === 0);

    if ($parsedStrictly) {
      $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {

      // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
      $html = trim($html);
      // NOTE(review): haystack and needle look swapped here, so the prefix is added for nearly
      // every input (also for input that already starts with an xml declaration) - kept as-is,
      // confirm before changing because it alters the parsed output.
      $xmlHackUsed = (stripos('<?xml', $html) !== 0);
      if ($xmlHackUsed) {
        $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
      }

      $html = self::replaceToPreserveHtmlEntities($html);

      if ($optionsXml && Bootup::is_php('5.4')) {
        $this->document->loadHTML($html, $optionsXml);
      } else {
        $this->document->loadHTML($html);
      }

      // remove the "xml-encoding" hack
      if ($xmlHackUsed) {
        // NOTE(review): childNodes is a live list, removing while iterating may skip the node
        // that directly follows a removed one - verify if several processing instructions occur.
        foreach ($this->document->childNodes as $child) {
          if ($child->nodeType === XML_PI_NODE) {
            $this->document->removeChild($child);
          }
        }
      }

      libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($previousInternalErrors);
    libxml_disable_entity_loader($previousEntityLoader);

    return $this->document;
  }
415
416
  /**
   * Return element by #id.
   *
   * <p>Shortcut for find() with the selector "#<id>" and index 0 (first match).</p>
   *
   * @param string $id
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementById($id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
427
428
  /**
   * Return element by tag name.
   *
   * <p>Only the first element with that tag name is returned; a blank node object is returned
   * when the document contains no such element.</p>
   *
   * @param string $name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementByTagName($name)
  {
    $firstMatch = $this->document->getElementsByTagName($name)->item(0);

    if ($firstMatch !== null) {
      return new SimpleHtmlDom($firstMatch);
    }

    return new SimpleHtmlDomNodeBlank();
  }
445
446
  /**
   * Returns elements by #id.
   *
   * <p>Shortcut for find() with the selector "#<id>".</p>
   *
   * @param string   $id
   * @param null|int $idx index of a single element, null for all matches
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function getElementsById($id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
458
459
  /**
   * Returns elements by tag name.
   *
   * <p>Without an index the whole list is returned. With an index a single element is returned;
   * negative values count from the end of the list, and an index without a matching element
   * yields a blank node object.</p>
   *
   * @param string   $name
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $elements = new SimpleHtmlDomNode();

    foreach ($this->document->getElementsByTagName($name) as $node) {
      $elements[] = new SimpleHtmlDom($node);
    }

    // return all elements
    if ($idx === null) {
      return $elements;
    }

    // handle negative values
    $position = $idx < 0 ? count($elements) + $idx : $idx;

    // return one element or a blank-element
    return isset($elements[$position]) ? $elements[$position] : new SimpleHtmlDomNodeBlank();
  }
495
496
  /**
   * Find list of nodes with a CSS selector.
   *
   * <p>The selector is converted to XPath via SelectorConverter::toXPath() and evaluated against
   * the current document. Without an index the whole list is returned. With an index a single
   * element is returned; negative values count from the end of the list, and an index without a
   * matching element yields a blank node object.</p>
   *
   * @param string $selector
   * @param int    $idx
   *
   * @return SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeInterface
   */
  public function find($selector, $idx = null)
  {
    /** @noinspection ExceptionsAnnotatingAndHandlingInspection */
    $xPathQuery = SelectorConverter::toXPath($selector);

    $xPath = new DOMXPath($this->document);
    $nodesList = $xPath->query($xPathQuery);

    $elements = new SimpleHtmlDomNode();
    foreach ($nodesList as $node) {
      $elements[] = new SimpleHtmlDom($node);
    }

    // return all elements
    if ($idx === null) {
      return $elements;
    }

    // handle negative values
    $position = $idx < 0 ? count($elements) + $idx : $idx;

    // return one element or a blank-element
    return isset($elements[$position]) ? $elements[$position] : new SimpleHtmlDomNodeBlank();
  }
535
536
  /**
   * Clean up serialized markup: strip the wrappers DOMDocument added around the original input,
   * url-decode the result and restore the entity placeholders.
   *
   * @param string $content
   * @param bool   $multiDecodeNewHtmlEntity true: decode via UTF8::rawurldecode(),
   *                                         otherwise via the native rawurldecode()
   *
   * @return string
   */
  protected function fixHtmlOutput($content, $multiDecodeNewHtmlEntity = false)
  {
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      $wrapperParts = array(
          "\n",
          "\r\n",
          "\r",
          '<body>',
          '</body>',
          '<html>',
          '</html>',
      );
      $content = str_replace($wrapperParts, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutWrapper === true) {
      // NOTE(review): only a leading "<p>" is removed, but every "</p>" is - confirm that this
      // is intended for input which contains paragraphs of its own.
      $content = preg_replace('/^<p>/', '', $content);
      $content = preg_replace('/<\/p>/', '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $plainTextParts = array(
          '<p>',
          '</p>',
          '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
      );
      $content = str_replace($plainTextParts, '', $content);
    }

    // helper tags are dropped, doubled head tags are collapsed
    $artifacts = array(
        '<simpleHtmlDomP>'  => '',
        '</simpleHtmlDomP>' => '',
        '<head><head>'      => '<head>',
        '</head></head>'    => '</head>',
    );
    $content = str_replace(array_keys($artifacts), array_values($artifacts), $content);

    $content = trim($content);

    $content = $multiDecodeNewHtmlEntity === true
        ? UTF8::rawurldecode($content)
        : rawurldecode($content);

    return self::putReplacedBackToPreserveHtmlEntities($content);
  }
608
609
  /**
   * Get the underlying DOMDocument (the instance itself, not a copy).
   *
   * @return DOMDocument
   */
  public function getDocument()
  {
    $document = $this->document;

    return $document;
  }
616
617
  /**
   * Get the encoding to use.
   *
   * @return string value of the $encoding property
   */
  private function getEncoding()
  {
    $encoding = $this->encoding;

    return $encoding;
  }
626
627
  /**
   * Whether the parsed input contained no "<" at all (flag set by createDOMDocument()).
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtml;

    return $flag;
  }
634
635
  /**
   * Whether the parsed input contained no "<html" (flag set by createDOMDocument()).
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $flag;
  }
642
643
  /**
   * Whether the parsed input contained no "<head>" (flag set by createDOMDocument()).
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHeadWrapper()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHeadWrapper;

    return $flag;
  }
650
651
  /**
   * Whether the parsed input contained a "<" but did not start with one after left-trimming
   * (flag set by createDOMDocument()).
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutWrapper()
  {
    $flag = $this->isDOMDocumentCreatedWithoutWrapper;

    return $flag;
  }
658
659
  /**
   * Get dom node's outer html.
   *
   * <p>If a callback was registered via set_callback(), it is invoked first and receives one
   * argument: an array that contains this parser. Input that had no "<html" wrapper is
   * serialized from the document element only, everything else as the whole document.</p>
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function html($multiDecodeNewHtmlEntity = false)
  {
    if ($this::$callback !== null) {
      call_user_func($this::$callback, array($this));
    }

    $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($content, $multiDecodeNewHtmlEntity);
  }
680
681
  /**
   * Get the HTML as XML.
   *
   * <p>The document is serialized with LIBXML_NOEMPTYTAG, the xml declaration is stripped and
   * the result runs through the same clean-up as the html output.</p>
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function xml($multiDecodeNewHtmlEntity = false)
  {
    $serialized = $this->document->saveXML(null, LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $withoutHeader = preg_replace('/<\?xml.*\?>/', '', $serialized);
    $withoutHeader = ltrim($withoutHeader);

    return $this->fixHtmlOutput($withoutHeader, $multiDecodeNewHtmlEntity);
  }
697
698
  /**
   * Get dom node's inner html.
   *
   * <p>Concatenates the serialized children of the document element. A document without a
   * document element (e.g. a parser that was created without any input) yields the cleaned-up
   * empty string instead of triggering "property of non-object" / invalid foreach warnings.</p>
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function innerHtml($multiDecodeNewHtmlEntity = false)
  {
    $text = '';

    // DOMDocument::$documentElement is null as long as the document has no root element
    $root = $this->document->documentElement;

    if ($root !== null) {
      foreach ($root->childNodes as $node) {
        $text .= $this->document->saveHTML($node);
      }
    }

    return $this->fixHtmlOutput($text, $multiDecodeNewHtmlEntity);
  }
715
716
  /**
   * Load HTML from string.
   *
   * <p>Replaces the current document with the one parsed from the given string.</p>
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions
   *
   * @return HtmlDomParser this instance, for chaining
   *
   * @throws InvalidArgumentException if argument is not string
   */
  public function loadHtml($html, $libXMLExtraOptions = null)
  {
    if (is_string($html) === false) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    $document = $this->createDOMDocument($html, $libXMLExtraOptions);
    $this->document = $document;

    return $this;
  }
736
737
  /**
   * Load HTML from file.
   *
   * <p>The path may be a local file or a http(s) url; only local paths are checked for
   * existence up-front. The content is read via UTF8::file_get_contents() and then parsed
   * with loadHtml().</p>
   *
   * @param string   $filePath
   * @param int|null $libXMLExtraOptions
   *
   * @return HtmlDomParser this instance, for chaining
   *
   * @throws \RuntimeException if the file does not exist or could not be read; when reading
   *                           failed with an exception, it is available via getPrevious()
   * @throws \InvalidArgumentException if the path is not a string
   */
  public function loadHtmlFile($filePath, $libXMLExtraOptions = null)
  {
    if (!is_string($filePath)) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    // remote urls cannot be checked with file_exists(), so only local paths are verified here
    if (!preg_match("/^https?:\/\//i", $filePath) && !file_exists($filePath)) {
      throw new RuntimeException("File $filePath not found");
    }

    try {
      $html = UTF8::file_get_contents($filePath);
    } catch (\Exception $e) {
      // keep the original failure attached, so the root cause is not lost for the caller
      throw new RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html, $libXMLExtraOptions);

    return $this;
  }
772
773
  /**
   * Save the html-dom as string.
   *
   * <p>Returns the inner html and, if a non-empty path is given, also writes it to that file
   * (with an exclusive lock). The result of the write is not checked.</p>
   *
   * @param string $filepath target file, '' to skip writing
   *
   * @return string
   */
  public function save($filepath = '')
  {
    $markup = $this->innerHtml();

    $writeToFile = ($filepath !== '');
    if ($writeToFile) {
      file_put_contents($filepath, $markup, LOCK_EX);
    }

    return $markup;
  }
789
790
  /**
   * Register a callback that html() invokes before it serializes the document.
   *
   * <p>The callback is stored in the static $callback property.</p>
   *
   * @param callable $functionName
   */
  public function set_callback($functionName)
  {
    $callback = $functionName;

    $this::$callback = $callback;
  }
797
798
  /**
   * Get dom node's plain text.
   *
   * <p>Uses the text content of the whole document and applies the same clean-up as the
   * html output.</p>
   *
   * @param bool $multiDecodeNewHtmlEntity
   *
   * @return string
   */
  public function text($multiDecodeNewHtmlEntity = false)
  {
    $plainText = $this->document->textContent;

    return $this->fixHtmlOutput($plainText, $multiDecodeNewHtmlEntity);
  }
809
}
810