Completed
Push — master ( cb4c73...cb8736 )
by Lars
02:58
created

HtmlDomParser::loadHtml()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 10
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 10
ccs 5
cts 5
cp 1
rs 9.4285
cc 2
eloc 5
nc 2
nop 1
crap 2
1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMDocument;
7
use DOMXPath;
8
use InvalidArgumentException;
9
use RuntimeException;
10
11
/**
12
 * Class HtmlDomParser
13
 *
14
 * @package voku\helper
15
 *
16
 * @property string      outertext Get dom node's outer html
17
 * @property string      innertext Get dom node's inner html
18
 * @property-read string plaintext Get dom node's plain text
19
 *
20
 * @method string outertext() Get dom node's outer html
21
 * @method string innertext() Get dom node's inner html
22
 * @method HtmlDomParser load() load($html) Load HTML from string
23
 * @method HtmlDomParser load_file() load_file($html) Load HTML from file
24
 *
25
 * @method static HtmlDomParser file_get_html() file_get_html($html) Load HTML from file
26
 * @method static HtmlDomParser str_get_html() str_get_html($html) Load HTML from string
27
 */
28
class HtmlDomParser
29
{
30
  /**
   * Legacy method aliases resolved by __call(): alias name => real method name.
   *
   * @var array
   */
33
  protected static $functionAliases = array(
34
      'outertext' => 'html',
35
      'innertext' => 'innerHtml',
36
      'load'      => 'loadHtml',
37
      'load_file' => 'loadHtmlFile',
38
  );
39
40
  /**
   * Characters masked inside detected http(s) links before parsing ('orig')
   * and the placeholder written in their place ('tmp'). Both lists are
   * index-aligned because they are handed to str_replace() as search/replace.
   *
   * @var array
   */
43
  private static $domLinkReplaceHelper = array(
44
      'orig' => array('[', ']', '{', '}',),
45
      'tmp'  => array(
46
          '!!!!HTML_DOM__SQUARE_BRACKET_LEFT!!!!',
47
          '!!!!HTML_DOM__SQUARE_BRACKET_RIGHT!!!!',
48
          '!!!!HTML_DOM__BRACKET_LEFT!!!!',
49
          '!!!!HTML_DOM__BRACKET_RIGHT!!!!',
50
      ),
51
  );
52
53
  /**
   * Characters masked everywhere in the markup before parsing ('orig') and
   * their placeholders ('tmp'); restored again when the markup is serialized.
   *
   * @var array
   */
56
  protected static $domReplaceHelper = array(
57
      'orig' => array('&', '|'),
58
      'tmp'  => array('!!!!HTML_DOM__AMP!!!!', '!!!!HTML_DOM__PIPE!!!!'),
59
  );
60
61
  /**
   * Optional callback invoked by html() with the parser as its only argument.
   * Static, so it is not stored per instance.
   *
   * @var Callable
   */
64
  protected static $callback;
65
66
  /**
   * The DOM document this parser wraps.
   *
   * @var DOMDocument
   */
69
  protected $document;
70
71
  /**
   * Encoding passed to the DOMDocument constructor and set on the loaded document.
   *
   * @var string
   */
74
  protected $encoding = 'UTF-8';
75
76
  /**
   * Set by createDOMDocument() when the loaded input contains no '<' at all.
   *
   * @var bool
   */
79
  protected $isDOMDocumentCreatedWithoutHtml = false;
80
81
  /**
   * Set by createDOMDocument() when the loaded input contains no '<html'.
   *
   * @var bool
   */
84
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
85
86
  /**
   * Create a parser, optionally seeded with markup or an existing DOM node.
   *
   * @param string|SimpleHtmlDom|\DOMNode|null $element HTML string, a SimpleHtmlDom wrapper
   *                                                    (its node is used) or a \DOMNode to import
   *
   * @throws InvalidArgumentException when $element is none of the above (raised by loadHtml())
   */
  public function __construct($element = null)
  {
    $this->document = new \DOMDocument('1.0', $this->getEncoding());

    // DOMDocument settings
    $this->document->preserveWhiteSpace = false;
    $this->document->formatOutput = true;

    // unwrap a SimpleHtmlDom so that only raw nodes / strings are handled below
    $source = $element instanceof SimpleHtmlDom ? $element->getNode() : $element;

    if ($source instanceof \DOMNode) {
      // a node owned by another document has to be imported (deep copy) before it can be attached
      $imported = $this->document->importNode($source, true);

      if ($imported instanceof \DOMNode) {
        $this->document->appendChild($imported);
      }
    } elseif ($source !== null) {
      $this->loadHtml($source);
    }
  }
117
118
  /**
   * Route calls to legacy method names (see self::$functionAliases) to the real method.
   *
   * @param string $name      called method name
   * @param array  $arguments arguments of the call, forwarded unchanged
   *
   * @return mixed result of the aliased method
   *
   * @throws BadMethodCallException when $name is not a known alias
   */
  public function __call($name, $arguments)
  {
    // NOTE(review): static analysis reports that this method is duplicated elsewhere in the project.
    if (!isset(self::$functionAliases[$name])) {
      throw new BadMethodCallException('Method does not exist: ' . $name);
    }

    $target = array($this, self::$functionAliases[$name]);

    return call_user_func_array($target, $arguments);
  }
132
133
  /**
   * Static factories kept for API compatibility: str_get_html() and file_get_html().
   *
   * @param string $name      called static method name
   * @param array  $arguments arguments of the call; only the first one is used
   *
   * @return HtmlDomParser a new parser loaded from the given string / file
   *
   * @throws BadMethodCallException when $name is neither of the two supported names
   */
  public static function __callStatic($name, $arguments)
  {
    switch ($name) {
      case 'str_get_html':
        $instance = new self();

        return $instance->loadHtml($arguments[0]);

      case 'file_get_html':
        $instance = new self();

        return $instance->loadHtmlFile($arguments[0]);
    }

    throw new BadMethodCallException('Method does not exist');
  }
155
156
  /**
   * Expose the virtual properties "outertext", "innertext" and "plaintext".
   *
   * @param string $name property name
   *
   * @return string|null the requested representation, or null for any other name
   */
  public function __get($name)
  {
    // NOTE(review): static analysis reports that this method is duplicated elsewhere in the project.
    // Loose comparison is used on purpose: it matches the semantics of a switch statement.
    if ($name == 'outertext') {
      return $this->html();
    }

    if ($name == 'innertext') {
      return $this->innerHtml();
    }

    if ($name == 'plaintext') {
      return $this->text();
    }

    return null;
  }
174
175
  /**
   * Allow the parser object to be called like a function; forwards to find().
   *
   * @param string   $selector selector handed to find()
   * @param int|null $idx      optional index handed to find()
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank whatever find() returns
   */
  public function __invoke($selector, $idx = null)
  {
    $result = $this->find($selector, $idx);

    return $result;
  }
185
186
  /**
   * String conversion yields the same markup as html().
   *
   * @return string
   */
  public function __toString()
  {
    $markup = $this->html();

    return $markup;
  }
193
194
  /**
   * No-op that exists only for API compatibility.
   *
   * @return bool always true
   */
  public function clear()
  {
    return true;
  }
203
204
  /**
   * Mask characters in the markup with placeholder tokens before it is parsed.
   *
   * '&' and '|' are replaced everywhere; '[', ']', '{' and '}' are replaced only
   * inside detected http(s) links. putReplacedBackToPreserveHtmlEntities() reverses this.
   *
   * @param string $html raw markup
   *
   * @return string markup with placeholders
   */
  private function replaceToPreserveHtmlEntities($html)
  {
    // replacements that always apply
    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    preg_match_all("/(\bhttps?:\/\/[^\s()<>]+(?:\([\w\d]+\)|[^[:punct:]\s]|\/|\}|\]))/i", $html, $matches);

    if (!empty($matches[1])) {
      $links = $matches[1];

      // str_replace() on an array subject masks every link at once and keeps the keys aligned
      $maskedLinks = str_replace(
          self::$domLinkReplaceHelper['orig'],
          self::$domLinkReplaceHelper['tmp'],
          $links
      );

      // links go first so that they are swapped for their masked form before '&' / '|' are handled
      $search = array_merge($links, $search);
      $replace = array_merge($maskedLinks, $replace);
    }

    return str_replace($search, $replace, $html);
  }
236
237
  /**
   * Turn the placeholder tokens back into the original characters and drop "&#13;".
   *
   * @param string $html serialized markup that may contain placeholders
   *
   * @return string markup with the original characters restored
   */
  private function putReplacedBackToPreserveHtmlEntities($html)
  {
    $placeholders = array_merge(
        self::$domLinkReplaceHelper['tmp'],
        self::$domReplaceHelper['tmp'],
        array('&#13;')
    );

    // index-aligned with $placeholders; "&#13;" maps to the empty string
    $originals = array_merge(
        self::$domLinkReplaceHelper['orig'],
        self::$domReplaceHelper['orig'],
        array('')
    );

    return str_replace($placeholders, $originals, $html);
  }
258
259
  /**
   * Create the DOMDocument from HTML.
   *
   * The input is first tried as well-formed XML via SimpleXML; when that fails or
   * libxml reports any error, it is loaded with DOMDocument::loadHTML() instead.
   * For that fallback an XML declaration carrying the encoding is prepended
   * (unless the input already starts with "<?xml") and removed again afterwards.
   *
   * Also records whether the input had no tags at all / no "<html" wrapper; both
   * flags are recomputed on every call so that a reused parser does not keep
   * stale values from a previous load.
   *
   * @param string $html
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html)
  {
    $this->isDOMDocumentCreatedWithoutHtml = (strpos($html, '<') === false);
    $this->isDOMDocumentCreatedWithoutHtmlWrapper = (strpos($html, '<html') === false);

    // collect libxml errors internally instead of emitting warnings; remember the old settings
    $internalErrors = libxml_use_internal_errors(true);
    // NOTE(review): libxml_disable_entity_loader() is deprecated as of PHP 8.0 - confirm target versions
    $disableEntityLoader = libxml_disable_entity_loader(true);
    libxml_clear_errors();

    $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;
    // defined() expects the constant's *name* as a string
    if (defined('LIBXML_COMPACT')) {
      $options |= LIBXML_COMPACT;
    }

    $sxe = simplexml_load_string($html, 'SimpleXMLElement', $options);
    if ($sxe !== false && count(libxml_get_errors()) === 0) {
      $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {

      // prepend the encoding declaration only when the markup does not already start with one
      $xmlHackUsed = false;
      if (stripos($html, '<?xml') !== 0) {
        $xmlHackUsed = true;
        $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
      }

      $html = $this->replaceToPreserveHtmlEntities($html);

      $this->document->loadHTML($html);

      // remove the "xml-encoding" hack
      if ($xmlHackUsed === true) {
        // childNodes is a live list, so collect the nodes first and remove them afterwards
        $hackNodes = array();
        foreach ($this->document->childNodes as $child) {
          if ($child->nodeType == XML_PI_NODE) {
            $hackNodes[] = $child;
          }
        }

        foreach ($hackNodes as $hackNode) {
          $this->document->removeChild($hackNode);
        }
      }

      libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($internalErrors);
    libxml_disable_entity_loader($disableEntityLoader);

    return $this->document;
  }
322
323
  /**
   * Return the first element matching the given id.
   *
   * @param string $id element id (without the leading "#")
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank result of find() for "#<id>" at index 0
   */
  public function getElementById($id)
  {
    return $this->find('#' . $id, 0);
  }
334
335
  /**
   * Return the first element with the given tag name.
   *
   * @param string $name tag name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank the wrapped first match, or a blank object when there is none
   */
  public function getElementByTagName($name)
  {
    $first = $this->document->getElementsByTagName($name)->item(0);

    if ($first === null) {
      return new SimpleHtmlDomNodeBlank();
    }

    return new SimpleHtmlDom($first);
  }
352
353
  /**
   * Return the elements matching the given id.
   *
   * @param string   $id  element id (without the leading "#")
   * @param null|int $idx optional index handed to find()
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank result of find() for "#<id>"
   */
  public function getElementsById($id, $idx = null)
  {
    return $this->find('#' . $id, $idx);
  }
365
366
  /**
   * Return the elements with the given tag name.
   *
   * @param string   $name tag name
   * @param null|int $idx  null returns every match; otherwise the position of the wanted
   *                       match, negative values counting from the end
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank the full list when $idx is null,
   *         the element at $idx, or a blank object when that position does not exist
   */
  public function getElementsByTagName($name, $idx = null)
  {
    // NOTE(review): static analysis reports that this method is duplicated elsewhere in the project.
    $collection = new SimpleHtmlDomNode();

    foreach ($this->document->getElementsByTagName($name) as $domNode) {
      $collection[] = new SimpleHtmlDom($domNode);
    }

    if (null === $idx) {
      return $collection;
    }

    // a negative index is counted from the end of the list
    if ($idx < 0) {
      $idx = count($collection) + $idx;
    }

    return isset($collection[$idx]) ? $collection[$idx] : new SimpleHtmlDomNodeBlank();
  }
398
399
  /**
   * Find nodes with a CSS selector.
   *
   * @param string   $selector CSS selector; converted to XPath via SelectorConverter::toXPath()
   * @param int|null $idx      null returns every match; otherwise the position of the wanted
   *                           match, negative values counting from the end
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank the full list when $idx is null,
   *         the element at $idx, or a blank object when that position does not exist
   */
  public function find($selector, $idx = null)
  {
    $xPathQuery = SelectorConverter::toXPath($selector);

    $xPath = new DOMXPath($this->document);
    $nodesList = $xPath->query($xPathQuery);
    $elements = new SimpleHtmlDomNode();

    // DOMXPath::query() yields false for an expression it cannot evaluate;
    // treat that as "no matches" instead of iterating over a boolean
    if ($nodesList !== false) {
      foreach ($nodesList as $node) {
        $elements[] = new SimpleHtmlDom($node);
      }
    }

    if (null === $idx) {
      return $elements;
    }

    // a negative index is counted from the end of the list
    if ($idx < 0) {
      $idx = count($elements) + $idx;
    }

    if (isset($elements[$idx])) {
      return $elements[$idx];
    }

    return new SimpleHtmlDomNodeBlank();
  }
433
434
  /**
   * Clean up markup serialized by DOMDocument before it is handed to the caller.
   *
   * @param string $content serialized markup
   *
   * @return string cleaned markup with the masked characters restored
   */
  protected function fixHtmlOutput($content)
  {
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      // line breaks plus the wrapper tags that were not part of the input
      $wrapperResidue = array(
          "\n",
          "\r\n",
          "\r",
          '<simpleHtmlDomP>',
          '</simpleHtmlDomP>',
          '<body>',
          '</body>',
          '<html>',
          '</html>',
      );

      $content = str_replace($wrapperResidue, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $content = str_replace(array('<p>', '</p>'), '', $content);
    }

    // same order as before: decode entities, trim, url-decode
    $content = UTF8::urldecode(UTF8::trim(UTF8::html_entity_decode($content)));

    return $this->putReplacedBackToPreserveHtmlEntities($content);
  }
474
475
  /**
   * Give access to the wrapped DOM document.
   *
   * @return DOMDocument
   */
  public function getDocument()
  {
    return $this->document;
  }
482
483
  /**
   * Encoding used when creating and loading the document.
   *
   * @return string
   */
  private function getEncoding()
  {
    return $this->encoding;
  }
492
493
  /**
   * Whether the loaded input contained no '<' at all.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml()
  {
    return $this->isDOMDocumentCreatedWithoutHtml;
  }
500
501
  /**
   * Whether the loaded input contained no '<html'.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
  {
    return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
  }
508
509
  /**
   * Get the document's outer html.
   *
   * When a callback has been registered via set_callback() it is invoked with
   * this parser before the document is serialized.
   *
   * @return string
   */
  public function html()
  {
    $hook = $this::$callback;
    if ($hook !== null) {
      call_user_func_array($hook, array($this));
    }

    // without an <html> wrapper in the input only the root element is serialized
    $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($content);
  }
528
529
  /**
   * Get the document's inner html: the serialized children of the root element,
   * each passed through fixHtmlOutput() and concatenated.
   *
   * @return string empty string when the document has no root element
   */
  public function innerHtml()
  {
    $text = '';

    // a parser without loaded markup has no root element to read children from
    $root = $this->document->documentElement;
    if ($root === null) {
      return $text;
    }

    foreach ($root->childNodes as $node) {
      $text .= $this->fixHtmlOutput($this->document->saveHTML($node));
    }

    return $text;
  }
544
545
  /**
546
   * Load HTML from string
547
   *
548
   * @param string $html
549
   *
550
   * @return HtmlDomParser
551
   *
552
   * @throws InvalidArgumentException if argument is not string
553
   */
554 77
  public function loadHtml($html)
555
  {
556 77
    if (!is_string($html)) {
557 3
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
558
    }
559
560 74
    $this->document = $this->createDOMDocument($html);
561
562 74
    return $this;
563
  }
564
565
  /**
   * Load HTML from file.
   *
   * @param string $filePath local path or http(s) URL
   *
   * @return HtmlDomParser this parser, for chaining
   *
   * @throws InvalidArgumentException if argument is not string
   * @throws RuntimeException         if a local file does not exist or the content cannot be read
   */
  public function loadHtmlFile($filePath)
  {
    if (!is_string($filePath)) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    // the existence check only makes sense for local paths, not for http(s) URLs
    if (!preg_match("/^https?:\/\//i", $filePath) && !file_exists($filePath)) {
      throw new RuntimeException("File $filePath not found");
    }

    try {
      $html = file_get_contents($filePath);
    } catch (\Exception $e) {
      // keep the original failure attached as "previous" so the cause is not lost
      throw new RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html);

    return $this;
  }
597
598
  /**
   * Return the document's inner html and optionally write it to a file.
   *
   * @param string $filepath target file; nothing is written when empty
   *
   * @return string the inner html
   */
  public function save($filepath = '')
  {
    $markup = $this->innerHtml();

    if ($filepath === '') {
      return $markup;
    }

    // exclusive lock while writing
    file_put_contents($filepath, $markup, LOCK_EX);

    return $markup;
  }
614
615
  /**
   * Register the callback that html() invokes with the parser as its only argument.
   * The callback is kept in a static property, i.e. it is not stored per instance.
   *
   * @param callable|null $functionName callback to register; html() skips the call while this is null
   */
  public function set_callback($functionName)
  {
    $this::$callback = $functionName;
  }
622
623
  /**
   * Get the document's plain text (the textContent of the DOM document).
   *
   * @return string
   */
  public function text()
  {
    return $this->document->textContent;
  }
632
}
633