Completed
Push — master ( 9cf0fc...e483df )
by Lars
02:51
created

HtmlDomParser   D

Complexity

Total Complexity 84

Size/Duplication

Total Lines 698
Duplicated Lines 3.87 %

Coupling/Cohesion

Components 1
Dependencies 6

Test Coverage

Coverage 94.85%

Importance

Changes 1
Bugs 0 Features 0
Metric Value
wmc 84
c 1
b 0
f 0
lcom 1
cbo 6
dl 27
loc 698
ccs 258
cts 272
cp 0.9485
rs 4.4536

29 Methods

Rating   Name   Duplication   Size   Complexity  
B __construct() 0 29 5
A addRandBytesToDomReplaceHelpers() 0 10 3
A __call() 3 10 2
B __callStatic() 0 26 5
B __get() 0 18 7
A __invoke() 0 4 1
A __toString() 0 4 1
A clear() 0 4 1
B replaceToPreserveHtmlEntities() 0 32 6
A putReplacedBackToPreserveHtmlEntities() 0 16 1
D createDOMDocument() 0 71 13
A getElementById() 0 4 1
A getElementByTagName() 0 10 2
A getElementsById() 0 4 1
B getElementsByTagName() 24 24 5
B find() 0 26 5
B fixHtmlOutput() 0 42 3
A getDocument() 0 4 1
A getEncoding() 0 4 1
A getIsDOMDocumentCreatedWithoutHtml() 0 4 1
A getIsDOMDocumentCreatedWithoutHtmlWrapper() 0 4 1
A html() 0 14 3
A xml() 0 9 1
A innerHtml() 0 10 2
A loadHtml() 0 10 2
B loadHtmlFile() 0 25 6
A save() 0 9 2
A set_callback() 0 4 1
A text() 0 4 1

How to fix   Duplicated Code    Complexity   

Duplicated Code

Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.

Common duplication problems and their corresponding solutions are:

Complex Class

 Tip:   Before tackling complexity, make sure that you eliminate any duplication first. This often can reduce the size of classes significantly.

Complex classes like HtmlDomParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes. You can also have a look at the cohesion graph to spot any un-connected, or weakly-connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use HtmlDomParser, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMDocument;
7
use DOMXPath;
8
use InvalidArgumentException;
9
use RuntimeException;
10
11
/**
12
 * Class HtmlDomParser
13
 *
14
 * @package voku\helper
15
 *
16
 * @property-read string outerText Get dom node's outer html (alias for "outerHtml")
17
 * @property-read string outerHtml Get dom node's outer html
18
 * @property-read string innerText Get dom node's inner html (alias for "innerHtml")
19
 * @property-read string innerHtml Get dom node's inner html
20
 * @property-read string plaintext Get dom node's plain text
21
 *
22
 * @method string outerText() Get dom node's outer html (alias for "outerHtml()")
23
 * @method string outerHtml() Get dom node's outer html
24
 * @method string innerText() Get dom node's inner html (alias for "innerHtml()")
25
 * @method HtmlDomParser load() load($html) Load HTML from string
26
 * @method HtmlDomParser load_file() load_file($html) Load HTML from file
27
 *
28
 * @method static HtmlDomParser file_get_html() file_get_html($html, $libXMLExtraOptions = null) Load HTML from file
29
 * @method static HtmlDomParser str_get_html() str_get_html($html, $libXMLExtraOptions = null) Load HTML from string
30
 */
31
class HtmlDomParser
32
{
33
  /**
34
   * @var array
35
   */
36
  protected static $functionAliases = array(
37
      'outertext' => 'html',
38
      'outerhtml' => 'html',
39
      'innertext' => 'innerHtml',
40
      'innerhtml' => 'innerHtml',
41
      'load'      => 'loadHtml',
42
      'load_file' => 'loadHtmlFile',
43
  );
44
45
  /**
46
   * @var array
47
   */
48
  protected static $domLinkReplaceHelper = array(
49
      'orig' => array('[', ']', '{', '}',),
50
      'tmp'  => array(
51
          '!!!!HTML_DOM__SQUARE_BRACKET_LEFT!!!!',
52
          '!!!!HTML_DOM__SQUARE_BRACKET_RIGHT!!!!',
53
          '!!!!HTML_DOM__BRACKET_LEFT!!!!',
54
          '!!!!HTML_DOM__BRACKET_RIGHT!!!!',
55
      ),
56
  );
57
58
  /**
59
   * @var array
60
   */
61
  protected static $domReplaceHelper = array(
62
      'orig' => array('&', '|', '+', '%'),
63
      'tmp'  => array(
64
          '!!!!HTML_DOM__AMP!!!!',
65
          '!!!!HTML_DOM__PIPE!!!!',
66
          '!!!!HTML_DOM__PLUS!!!!',
67
          '!!!!HTML_DOM__PERCENT!!!!',
68
      ),
69
  );
70
71
  /**
72
   * @var Callable
73
   */
74
  protected static $callback;
75
76
  /**
77
   * @var DOMDocument
78
   */
79
  protected $document;
80
81
  /**
82
   * @var string
83
   */
84
  protected $encoding = 'UTF-8';
85
86
  /**
87
   * @var bool
88
   */
89
  protected $isDOMDocumentCreatedWithoutHtml = false;
90
91
  /**
92
   * @var bool
93
   */
94
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
95
96
  /**
97
   * A random md5-hash, generated via "random_bytes()".
98
   *
99
   * @var string
100
   */
101
  protected $randomHash;
102
103
  /**
   * Set up an empty DOMDocument and optionally fill it from the given input.
   *
   * A SimpleHtmlDom is unwrapped via getNode(); a \DOMNode is imported (deep)
   * into the new document; any other non-null value is passed to loadHtml().
   *
   * @param string|SimpleHtmlDom|\DOMNode $element HTML code or SimpleHtmlDom, \DOMNode
   */
  public function __construct($element = null)
  {
    $this->randomHash = md5(Bootup::get_random_bytes(16));
    $this->document = new \DOMDocument('1.0', $this->getEncoding());

    $this->addRandBytesToDomReplaceHelpers();

    // DOMDocument settings
    $this->document->preserveWhiteSpace = true;
    $this->document->formatOutput = true;

    // unwrap our own node wrapper, everything else is used as given
    $source = ($element instanceof SimpleHtmlDom) ? $element->getNode() : $element;

    if ($source instanceof \DOMNode) {
      // the node belongs to another document, so it must be imported first
      $imported = $this->document->importNode($source, true);

      if ($imported instanceof \DOMNode) {
        $this->document->appendChild($imported);
      }
    } elseif ($source !== null) {
      $this->loadHtml($source);
    }
  }
137
138
  /**
   * Add rand-bytes to the "Dom-Replace-Helper"-variables.
   *
   * The placeholder lists are static and therefore shared by all instances.
   * A placeholder is only salted while it still ends with its literal "!!!!"
   * terminator, i.e. exactly once: appending a new hash on every construction
   * would let the strings grow without bound and would change placeholders
   * that are already embedded in a previously loaded document, so that
   * putReplacedBackToPreserveHtmlEntities() could no longer find them.
   */
  protected function addRandBytesToDomReplaceHelpers()
  {
    foreach (self::$domLinkReplaceHelper['tmp'] as $key => $placeholder) {
      // a md5-hash is hexadecimal, so a salted placeholder never ends with "!!!!"
      if (substr($placeholder, -4) === '!!!!') {
        self::$domLinkReplaceHelper['tmp'][$key] = $placeholder . $this->randomHash;
      }
    }

    foreach (self::$domReplaceHelper['tmp'] as $key => $placeholder) {
      if (substr($placeholder, -4) === '!!!!') {
        self::$domReplaceHelper['tmp'][$key] = $placeholder . $this->randomHash;
      }
    }
  }
151
152
  /**
   * Dispatch calls to aliased method names (see self::$functionAliases).
   *
   * The lookup is case-insensitive because the name is lower-cased first.
   *
   * @param string $name
   * @param array  $arguments
   *
   * @return bool|mixed
   *
   * @throws BadMethodCallException if no alias with this name is registered
   */
  public function __call($name, $arguments)
  {
    $alias = strtolower($name);

    if (!isset(self::$functionAliases[$alias])) {
      throw new BadMethodCallException('Method does not exist: ' . $alias);
    }

    $target = array($this, self::$functionAliases[$alias]);

    return call_user_func_array($target, $arguments);
  }
168
169
  /**
   * Static entry points "str_get_html" and "file_get_html".
   *
   * Both create a fresh parser and forward the first two arguments
   * (missing ones are passed as null) to loadHtml() / loadHtmlFile().
   *
   * @param string $name
   * @param array  $arguments
   *
   * @return HtmlDomParser
   *
   * @throws BadMethodCallException for every other method name
   */
  public static function __callStatic($name, $arguments)
  {
    $first = isset($arguments[0]) ? $arguments[0] : null;
    $second = isset($arguments[1]) ? $arguments[1] : null;

    switch ($name) {
      case 'str_get_html':
        $parser = new self();

        return $parser->loadHtml($first, $second);

      case 'file_get_html':
        $parser = new self();

        return $parser->loadHtmlFile($first, $second);
    }

    throw new BadMethodCallException('Method does not exist');
  }
201
202
  /**
   * Magic read access for the virtual properties
   * "outerHtml" / "outerText", "innerHtml" / "innerText" and "text" / "plaintext"
   * (case-insensitive).
   *
   * @param string $name
   *
   * @return string|null null for unknown property names
   */
  public function __get($name)
  {
    $property = strtolower($name);

    if ($property === 'outerhtml' || $property === 'outertext') {
      return $this->html();
    }

    if ($property === 'innerhtml' || $property === 'innertext') {
      return $this->innerHtml();
    }

    if ($property === 'text' || $property === 'plaintext') {
      return $this->text();
    }

    return null;
  }
225
226
  /**
   * Calling the parser like a function is a shortcut for find().
   *
   * @param string $selector
   * @param int    $idx
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|null
   */
  public function __invoke($selector, $idx = null)
  {
    $result = $this->find($selector, $idx);

    return $result;
  }
236
237
  /**
   * String conversion yields the same value as html().
   *
   * @return string
   */
  public function __toString()
  {
    $outerHtml = $this->html();

    return $outerHtml;
  }
244
245
  /**
   * No-op, kept only for api-compatibility-reasons.
   *
   * @return bool always true
   */
  public function clear()
  {
    // nothing to release here
    return true;
  }
254
255
  /**
   * Replace special characters by temporary placeholders.
   *
   * The characters from self::$domReplaceHelper are always replaced. If the
   * input contains "http", every matched link is additionally replaced by a
   * copy in which the characters from self::$domLinkReplaceHelper are
   * substituted.
   *
   * @param string $html
   *
   * @return string
   */
  public static function replaceToPreserveHtmlEntities($html)
  {
    // default: only the general placeholders
    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    if (strpos($html, 'http') !== false) {
      $matches = array();
      preg_match_all("/(\bhttps?:\/\/[^\s()<>]+(?:\([\w\d]+\)|[^[:punct:]\s]|\/|\}|\]))/i", $html, $matches);

      if (!empty($matches[1])) {
        $protectedLinks = array();
        foreach ($matches[1] as $linkKey => $link) {
          $protectedLinks[$linkKey] = str_replace(
              self::$domLinkReplaceHelper['orig'],
              self::$domLinkReplaceHelper['tmp'],
              $link
          );
        }

        // links go first, so they are handled before the general placeholders
        $search = array_merge($matches[1], $search);
        $replace = array_merge($protectedLinks, $replace);
      }
    }

    return str_replace($search, $replace, $html);
  }
292
293
  /**
   * Reverse replaceToPreserveHtmlEntities(): turn the temporary placeholders
   * back into their original characters and remove every "&#13;".
   *
   * @param string $html
   *
   * @return string
   */
  public static function putReplacedBackToPreserveHtmlEntities($html)
  {
    $placeholders = array_merge(
        self::$domLinkReplaceHelper['tmp'],
        self::$domReplaceHelper['tmp'],
        array('&#13;')
    );

    $originals = array_merge(
        self::$domLinkReplaceHelper['orig'],
        self::$domReplaceHelper['orig'],
        array('')
    );

    return str_replace($placeholders, $originals, $html);
  }
314
315
  /**
   * create DOMDocument from HTML
   *
   * The markup is first parsed with SimpleXML. If that fails, or libxml
   * reports at least one error, the markup is loaded via
   * DOMDocument::loadHTML() instead, prefixed with an xml-encoding
   * declaration (unless it already starts with "<?xml") and with special
   * characters replaced by placeholders.
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions extra LIBXML_* flags, only used for the SimpleXML attempt
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html, $libXMLExtraOptions = null)
  {
    // NOTE(review): both flags are only ever switched on, never reset, so they
    //               stay set if the same instance loads other markup later - confirm this is intended
    if (strpos($html, '<') === false) {
      $this->isDOMDocumentCreatedWithoutHtml = true;
    }

    if (strpos($html, '<html') === false) {
      $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
    }

    // set error level
    $internalErrors = libxml_use_internal_errors(true);
    $disableEntityLoader = libxml_disable_entity_loader(true);
    libxml_clear_errors();

    $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;

    if (defined('LIBXML_COMPACT')) {
      $options |= LIBXML_COMPACT;
    }

    if (defined('LIBXML_HTML_NOIMPLIED')) {
      $options |= LIBXML_HTML_NOIMPLIED;
    }

    if (defined('LIBXML_HTML_NODEFDTD')) {
      $options |= LIBXML_HTML_NODEFDTD;
    }

    if ($libXMLExtraOptions !== null) {
      $options |= $libXMLExtraOptions;
    }

    $sxe = simplexml_load_string($html, 'SimpleXMLElement', $options);
    if ($sxe !== false && count(libxml_get_errors()) === 0) {
      $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {

      // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
      $html = trim($html);
      $xmlHackUsed = false;
      // haystack first, needle second: only add the declaration if the markup
      // does not already start with one
      if (stripos($html, '<?xml') !== 0) {
        $xmlHackUsed = true;
        $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
      }

      $html = self::replaceToPreserveHtmlEntities($html);

      $this->document->loadHTML($html);

      // remove the "xml-encoding" hack
      if ($xmlHackUsed === true) {
        // iterate over a snapshot: "childNodes" is a live list, removing
        // nodes while walking it could skip siblings
        foreach (iterator_to_array($this->document->childNodes) as $child) {
          if ($child->nodeType === XML_PI_NODE) {
            $this->document->removeChild($child);
          }
        }
      }

      libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($internalErrors);
    libxml_disable_entity_loader($disableEntityLoader);

    return $this->document;
  }
394
395
  /**
   * Return the first element matching the selector "#<id>".
   *
   * @param string $id
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementById($id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
406
407
  /**
   * Return the first element with the given tag name.
   *
   * @param string $name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank blank node if nothing was found
   */
  public function getElementByTagName($name)
  {
    $firstMatch = $this->document->getElementsByTagName($name)->item(0);

    if ($firstMatch === null) {
      return new SimpleHtmlDomNodeBlank();
    }

    return new SimpleHtmlDom($firstMatch);
  }
424
425
  /**
   * Returns the elements matching the selector "#<id>" (see find()).
   *
   * @param string   $id
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementsById($id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
437
438
  /**
   * Returns Elements by tag name
   *
   * Without $idx the whole list is returned. A negative $idx counts from the
   * end of the list; if no element exists at the position, a blank node is
   * returned.
   *
   * @param string   $name
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $matches = new SimpleHtmlDomNode();

    foreach ($this->document->getElementsByTagName($name) as $domNode) {
      $matches[] = new SimpleHtmlDom($domNode);
    }

    if (null === $idx) {
      return $matches;
    }

    if ($idx < 0) {
      $idx = count($matches) + $idx;
    }

    if (!isset($matches[$idx])) {
      return new SimpleHtmlDomNodeBlank();
    }

    return $matches[$idx];
  }
470
471
  /**
   * Find list of nodes with a CSS selector.
   *
   * The selector is converted to XPath and evaluated on the document.
   * Without $idx the whole list is returned. A negative $idx counts from the
   * end of the list; if no element exists at the position, a blank node is
   * returned.
   *
   * @param string $selector
   * @param int    $idx
   *
   * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
   */
  public function find($selector, $idx = null)
  {
    $query = SelectorConverter::toXPath($selector);
    $xPath = new DOMXPath($this->document);

    $matches = new SimpleHtmlDomNode();
    foreach ($xPath->query($query) as $domNode) {
      $matches[] = new SimpleHtmlDom($domNode);
    }

    if (null === $idx) {
      return $matches;
    }

    if ($idx < 0) {
      $idx = count($matches) + $idx;
    }

    if (!isset($matches[$idx])) {
      return new SimpleHtmlDomNodeBlank();
    }

    return $matches[$idx];
  }
505
506
  /**
   * Clean up serialized output: strip wrapper markup that was not part of the
   * input, decode entities, trim, url-decode and finally restore the
   * characters that were replaced by placeholders.
   *
   * @param string $content
   *
   * @return string
   */
  protected function fixHtmlOutput($content)
  {
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      // the order of this list is kept as-is, str_replace() works through it one by one
      $wrapperArtifacts = array(
          "\n",
          "\r\n",
          "\r",
          '<simpleHtmlDomP>',
          '</simpleHtmlDomP>',
          '<body>',
          '</body>',
          '<html>',
          '</html>',
      );

      $content = str_replace($wrapperArtifacts, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $plainTextArtifacts = array(
          '<p>',
          '</p>',
          '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">'
      );

      $content = str_replace($plainTextArtifacts, '', $content);
    }

    $decoded = trim(UTF8::html_entity_decode($content));

    return self::putReplacedBackToPreserveHtmlEntities(UTF8::rawurldecode($decoded));
  }
553
554
  /**
   * Get the underlying DOMDocument.
   *
   * @return DOMDocument
   */
  public function getDocument()
  {
    $document = $this->document;

    return $document;
  }
561
562
  /**
   * Get the encoding that is used for the document.
   *
   * @return string
   */
  private function getEncoding()
  {
    $encoding = $this->encoding;

    return $encoding;
  }
571
572
  /**
   * Whether markup without any "<" character was loaded.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtml;

    return $flag;
  }
579
580
  /**
   * Whether markup without a "<html" tag was loaded.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $flag;
  }
587
588
  /**
   * Get dom node's outer html
   *
   * A callback registered via set_callback() is invoked with this parser
   * before the document is serialized.
   *
   * @return string
   */
  public function html()
  {
    if ($this::$callback !== null) {
      call_user_func_array($this::$callback, array($this));
    }

    // without a html-wrapper in the input only the root element is serialized
    $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($content);
  }
607
608
  /**
   * Get the HTML as XML.
   *
   * @return string the serialized document without the XML-header
   */
  public function xml()
  {
    $serialized = $this->document->saveXML(null, LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $withoutHeader = preg_replace('/<\?xml.*\?>/', '', $serialized);

    return $this->fixHtmlOutput(ltrim($withoutHeader));
  }
622
623
  /**
   * Get dom node's inner html
   *
   * Every child of the document element is serialized and cleaned up on its
   * own; the results are concatenated.
   *
   * @return string
   */
  public function innerHtml()
  {
    $parts = array();

    foreach ($this->document->documentElement->childNodes as $child) {
      $parts[] = $this->fixHtmlOutput($this->document->saveHTML($child));
    }

    return implode('', $parts);
  }
638
639
  /**
   * Load HTML from string
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions
   *
   * @return HtmlDomParser this instance
   *
   * @throws InvalidArgumentException if argument is not string
   */
  public function loadHtml($html, $libXMLExtraOptions = null)
  {
    if (is_string($html) === false) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    $document = $this->createDOMDocument($html, $libXMLExtraOptions);
    $this->document = $document;

    return $this;
  }
659
660
  /**
   * Load HTML from file
   *
   * Paths starting with "http://" or "https://" are not checked with
   * file_exists() before reading.
   *
   * @param string   $filePath
   * @param int|null $libXMLExtraOptions
   *
   * @return HtmlDomParser this instance
   *
   * @throws InvalidArgumentException if $filePath is not a string
   * @throws RuntimeException if the file does not exist or could not be read
   */
  public function loadHtmlFile($filePath, $libXMLExtraOptions = null)
  {
    if (is_string($filePath) === false) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    if (!(preg_match("/^https?:\/\//i", $filePath) || file_exists($filePath))) {
      throw new RuntimeException("File $filePath not found");
    }

    try {
      $content = UTF8::file_get_contents($filePath);
    } catch (\Exception $e) {
      throw new RuntimeException("Could not load file $filePath");
    }

    if ($content === false) {
      throw new RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($content, $libXMLExtraOptions);

    return $this;
  }
693
694
  /**
   * Save dom as string
   *
   * The result of innerHtml() is returned and, if a non-empty path is given,
   * also written to that file (with an exclusive lock).
   *
   * @param string $filepath
   *
   * @return string
   */
  public function save($filepath = '')
  {
    $output = $this->innerHtml();

    if ($filepath === '') {
      return $output;
    }

    file_put_contents($filepath, $output, LOCK_EX);

    return $output;
  }
710
711
  /**
   * Register the callback that html() invokes before serializing.
   *
   * The callback is stored in a static property.
   *
   * @param callable $functionName
   */
  public function set_callback($functionName)
  {
    $callback = $functionName;

    $this::$callback = $callback;
  }
718
719
  /**
   * Get dom node's plain text
   *
   * @return string the cleaned-up text content of the document
   */
  public function text()
  {
    $plainText = $this->document->textContent;

    return $this->fixHtmlOutput($plainText);
  }
728
}
729