Completed
Push — master ( a85a91...44c366 )
by Lars
02:35
created

HtmlDomParser::html()   A

Complexity

Conditions 3
Paths 4

Size

Total Lines 14
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 7
CRAP Score 3.0987

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 14
ccs 7
cts 9
cp 0.7778
rs 9.4285
cc 3
eloc 8
nc 4
nop 0
crap 3.0987
1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMDocument;
7
use DOMXPath;
8
use InvalidArgumentException;
9
use RuntimeException;
10
11
/**
12
 * Class HtmlDomParser
13
 *
14
 * @package voku\helper
15
 *
16
 * @property string      outertext Get dom node's outer html
17
 * @property string      innertext Get dom node's inner html
18
 * @property-read string plaintext Get dom node's plain text
19
 *
20
 * @method string outertext() Get dom node's outer html
21
 * @method string innertext() Get dom node's inner html
22
 * @method HtmlDomParser load() load($html) Load HTML from string
23
 * @method HtmlDomParser load_file() load_file($html) Load HTML from file
24
 *
25
 * @method static HtmlDomParser file_get_html() file_get_html($html) Load HTML from file
26
 * @method static HtmlDomParser str_get_html() str_get_html($html) Load HTML from string
27
 */
28
class HtmlDomParser
29
{
30
  /**
   * Alias method name => real method name.
   *
   * Consulted by __call() so that e.g. load() is forwarded to loadHtml().
   *
   * @var array
   */
  protected static $functionAliases = array(
      'outertext' => 'html',
      'innertext' => 'innerHtml',
      'load'      => 'loadHtml',
      'load_file' => 'loadHtmlFile',
  );

  /**
   * Characters ('orig') and their placeholder tokens ('tmp') that
   * replaceToPreserveHtmlEntities() swaps inside matched http(s) links.
   *
   * @var array
   */
  private static $domLinkReplaceHelper = array(
      'orig' => array('[', ']', '{', '}'),
      'tmp'  => array(
          '!!!!HTML_DOM__SQUARE_BRACKET_LEFT!!!!',
          '!!!!HTML_DOM__SQUARE_BRACKET_RIGHT!!!!',
          '!!!!HTML_DOM__BRACKET_LEFT!!!!',
          '!!!!HTML_DOM__BRACKET_RIGHT!!!!',
      ),
  );

  /**
   * Characters ('orig') and their placeholder tokens ('tmp') that
   * replaceToPreserveHtmlEntities() swaps in the whole markup.
   *
   * @var array
   */
  protected static $domReplaceHelper = array(
      'orig' => array('&', '|'),
      'tmp'  => array('!!!!HTML_DOM__AMP!!!!', '!!!!HTML_DOM__PIPE!!!!'),
  );

  /**
   * Callable invoked with the parser instance at the start of html().
   *
   * Assigned through set_callback(); the property is static, so the value is
   * shared by every instance.
   *
   * @var callable|null
   */
  protected static $callback;

  /**
   * The wrapped DOM document.
   *
   * @var DOMDocument
   */
  protected $document;

  /**
   * Encoding handed to the DOMDocument constructor and assigned to the
   * document after loading.
   *
   * @var string
   */
  protected $encoding = 'UTF-8';

  /**
   * Set to true by createDOMDocument() when the loaded input contains no '<'.
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtml = false;

  /**
   * Set to true by createDOMDocument() when the loaded input contains no '<html'.
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
85
86
  /**
   * Constructor
   *
   * Creates an empty document and optionally fills it:
   * - a SimpleHtmlDom is first unwrapped through getNode(),
   * - a \DOMNode is deep-imported and appended to the new document,
   * - any other non-null value is handed to loadHtml().
   *
   * @param string|SimpleHtmlDom|\DOMNode $element HTML code or SimpleHtmlDom, \DOMNode
   */
  public function __construct($element = null)
  {
    $this->document = new \DOMDocument('1.0', $this->getEncoding());

    // DOMDocument settings
    $this->document->preserveWhiteSpace = false;
    $this->document->formatOutput = true;

    $source = ($element instanceof SimpleHtmlDom) ? $element->getNode() : $element;

    if ($source instanceof \DOMNode) {
      $imported = $this->document->importNode($source, true);

      if ($imported instanceof \DOMNode) {
        $this->document->appendChild($imported);
      }
    } elseif ($source !== null) {
      $this->loadHtml($source);
    }
  }
117
118
  /**
   * Forward calls to alias method names (see $functionAliases) to the real method.
   *
   * @param string $name      called method name
   * @param array  $arguments arguments passed to the call
   *
   * @return mixed whatever the aliased method returns
   *
   * @throws BadMethodCallException if $name is not a known alias
   */
  public function __call($name, $arguments)
  {
    if (!isset(self::$functionAliases[$name])) {
      throw new BadMethodCallException('Method does not exist: ' . $name);
    }

    $target = array($this, self::$functionAliases[$name]);

    return call_user_func_array($target, $arguments);
  }
132
133
  /**
   * Static factories: str_get_html($html) and file_get_html($path).
   *
   * Each creates a new parser and passes the first argument to loadHtml()
   * or loadHtmlFile() respectively.
   *
   * @param string $name      called static method name
   * @param array  $arguments arguments passed to the call; only the first one is used
   *
   * @return HtmlDomParser
   *
   * @throws BadMethodCallException if $name is neither of the two factories
   */
  public static function __callStatic($name, $arguments)
  {
    $loaders = array(
        'str_get_html'  => 'loadHtml',
        'file_get_html' => 'loadHtmlFile',
    );

    if (!isset($loaders[$name])) {
      throw new BadMethodCallException('Method does not exist');
    }

    $parser = new self();
    $loader = $loaders[$name];

    return $parser->$loader($arguments[0]);
  }
155
156
  /**
   * Magic read access for the outertext, innertext and plaintext properties.
   *
   * @param string $name property name
   *
   * @return string|null result of html(), innerHtml() or text(); null for any other name
   */
  public function __get($name)
  {
    $readers = array(
        'outertext' => 'html',
        'innertext' => 'innerHtml',
        'plaintext' => 'text',
    );

    if (isset($readers[$name])) {
      $reader = $readers[$name];

      return $this->$reader();
    }

    return null;
  }
174
175
  /**
   * Calling the parser like a function is a shortcut for find().
   *
   * @param string   $selector
   * @param null|int $idx
   *
   * @return mixed the result of find($selector, $idx)
   */
  public function __invoke($selector, $idx = null)
  {
    $found = $this->find($selector, $idx);

    return $found;
  }
185
186
  /**
   * String conversion yields the same value as html().
   *
   * @return string
   */
  public function __toString()
  {
    $markup = $this->html();

    return $markup;
  }
193
194
  /**
   * No-op kept only for API compatibility.
   *
   * @return bool always true
   */
  public function clear()
  {
    return true;
  }
203
204
  /**
   * Replace selected characters with placeholder tokens.
   *
   * '&' and '|' are replaced everywhere in $html. In addition, every matched
   * http(s) link is replaced by a copy in which '[', ']', '{' and '}' are
   * tokenised. putReplacedBackToPreserveHtmlEntities() reverses the tokens.
   *
   * @param string $html
   *
   * @return string
   */
  private function replaceToPreserveHtmlEntities($html)
  {
    preg_match_all("/(\bhttps?:\/\/[^\s()<>]+(?:\([\w\d]+\)|[^[:punct:]\s]|\/|\}|\]))/i", $html, $matches);

    // default: only the global replacements
    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    if (!empty($matches[1])) {
      $links = $matches[1];

      // tokenise the bracket characters inside each link (keys are preserved)
      $tokenisedLinks = str_replace(
          self::$domLinkReplaceHelper['orig'],
          self::$domLinkReplaceHelper['tmp'],
          $links
      );

      // links go first so they are swapped before '&' and '|' are touched
      $search = array_merge($links, $search);
      $replace = array_merge($tokenisedLinks, $replace);
    }

    return str_replace($search, $replace, $html);
  }
236
237
  /**
   * Turn the placeholder tokens back into their original characters and
   * drop every '&#13;' sequence.
   *
   * @param string $html
   *
   * @return string
   */
  private function putReplacedBackToPreserveHtmlEntities($html)
  {
    $placeholders = array_merge(
        self::$domLinkReplaceHelper['tmp'],
        self::$domReplaceHelper['tmp'],
        array('&#13;')
    );

    $originals = array_merge(
        self::$domLinkReplaceHelper['orig'],
        self::$domReplaceHelper['orig'],
        array('')
    );

    return str_replace($placeholders, $originals, $html);
  }
258
259
  /**
   * create DOMDocument from HTML
   *
   * The input is first parsed as XML with simplexml_load_string(). If that
   * fails, or libxml recorded any error, the input is loaded with
   * DOMDocument::loadHTML() instead, prefixed with an XML encoding declaration
   * (unless it already starts with "<?xml") and with selected characters
   * replaced by placeholder tokens.
   *
   * Also records whether the input contained no '<' and no '<html'.
   *
   * @param string $html
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html)
  {
    if (strpos($html, '<') === false) {
      $this->isDOMDocumentCreatedWithoutHtml = true;
    }

    if (strpos($html, '<html') === false) {
      $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
    }

    // collect libxml errors internally and remember the previous settings
    $internalErrors = libxml_use_internal_errors(true);
    $disableEntityLoader = libxml_disable_entity_loader(true);
    libxml_clear_errors();

    $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;
    // defined() expects the constant's NAME; passing the constant itself
    // looked up a constant named after its integer value and never matched.
    if (defined('LIBXML_COMPACT')) {
      $options |= LIBXML_COMPACT;
    }

    $sxe = simplexml_load_string($html, 'SimpleXMLElement', $options);
    if ($sxe !== false && count(libxml_get_errors()) === 0) {
      $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {

      // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
      $html = trim($html);
      $xmlHackUsed = false;
      // stripos(haystack, needle): only add the declaration when the markup
      // does not already begin with one.
      if (stripos($html, '<?xml') !== 0) {
        $xmlHackUsed = true;
        $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
      }

      $html = $this->replaceToPreserveHtmlEntities($html);

      $this->document->loadHTML($html);

      // remove the "xml-encoding" hack
      if ($xmlHackUsed === true) {
        // iterate over a snapshot: childNodes is a live list and removing
        // from it while iterating can skip siblings
        foreach (iterator_to_array($this->document->childNodes) as $child) {
          if ($child->nodeType === XML_PI_NODE) {
            $this->document->removeChild($child);
          }
        }
      }

      libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($internalErrors);
    libxml_disable_entity_loader($disableEntityLoader);

    return $this->document;
  }
324
325
  /**
   * Return the first element matching the given id.
   *
   * Shortcut for find('#' . $id, 0).
   *
   * @param string $id
   *
   * @return mixed the result of find() for index 0
   */
  public function getElementById($id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
336
337
  /**
   * Return the first element with the given tag name.
   *
   * @param string $name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank a blank node when no element matches
   */
  public function getElementByTagName($name)
  {
    $first = $this->document->getElementsByTagName($name)->item(0);

    if ($first === null) {
      return new SimpleHtmlDomNodeBlank();
    }

    return new SimpleHtmlDom($first);
  }
354
355
  /**
   * Returns Elements by id
   *
   * Shortcut for find('#' . $id, $idx).
   *
   * @param string   $id
   * @param null|int $idx
   *
   * @return mixed the result of find()
   */
  public function getElementsById($id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
367
368
  /**
   * Returns Elements by tag name
   *
   * @param string   $name
   * @param null|int $idx null returns the whole list; a negative value counts from the end
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
   *         the list, the element at $idx, or a blank node when $idx does not exist
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $elements = new SimpleHtmlDomNode();

    foreach ($this->document->getElementsByTagName($name) as $node) {
      $elements[] = new SimpleHtmlDom($node);
    }

    if ($idx === null) {
      return $elements;
    }

    if ($idx < 0) {
      $idx += count($elements);
    }

    return isset($elements[$idx]) ? $elements[$idx] : new SimpleHtmlDomNodeBlank();
  }
400
401
  /**
   * Find list of nodes with a CSS selector.
   *
   * The selector is converted with SelectorConverter::toXPath() and the
   * resulting XPath expression is evaluated against the current document.
   *
   * @param string   $selector
   * @param null|int $idx null returns the whole list; a negative value counts from the end
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
   *         the list, the element at $idx, or a blank node when $idx does not exist
   */
  public function find($selector, $idx = null)
  {
    $xPathQuery = SelectorConverter::toXPath($selector);

    $xPath = new DOMXPath($this->document);
    $nodesList = $xPath->query($xPathQuery);
    $elements = new SimpleHtmlDomNode();

    // DOMXPath::query() returns false for a malformed expression; treat that
    // as "no matches" instead of handing false to foreach.
    if ($nodesList !== false) {
      foreach ($nodesList as $node) {
        $elements[] = new SimpleHtmlDom($node);
      }
    }

    if (null === $idx) {
      return $elements;
    }

    if ($idx < 0) {
      $idx = count($elements) + $idx;
    }

    if (isset($elements[$idx])) {
      return $elements[$idx];
    }

    return new SimpleHtmlDomNodeBlank();
  }
435
436
  /**
   * Post-process serialized markup before it is returned to the caller.
   *
   * Depending on the flags recorded while loading, line breaks and wrapper
   * tags are stripped; afterwards the content is entity-decoded, trimmed,
   * url-decoded and the placeholder tokens are restored.
   *
   * @param string $content
   *
   * @return string
   */
  protected function fixHtmlOutput($content)
  {
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      $wrapperParts = array(
          "\n",
          "\r\n",
          "\r",
          '<simpleHtmlDomP>',
          '</simpleHtmlDomP>',
          '<body>',
          '</body>',
          '<html>',
          '</html>',
      );

      $content = str_replace($wrapperParts, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $content = str_replace(array('<p>', '</p>'), '', $content);
    }

    $decoded = UTF8::urldecode(trim(UTF8::html_entity_decode($content)));

    return $this->putReplacedBackToPreserveHtmlEntities($decoded);
  }
476
477
  /**
   * Get the wrapped DOM document.
   *
   * @return DOMDocument
   */
  public function getDocument()
  {
    return $this->document;
  }
484
485
  /**
   * Get the encoding used when creating and loading the document.
   *
   * @return string
   */
  private function getEncoding()
  {
    return $this->encoding;
  }
494
495
  /**
   * Whether createDOMDocument() recorded an input without any '<' character.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml()
  {
    return $this->isDOMDocumentCreatedWithoutHtml;
  }
502
503
  /**
   * Whether createDOMDocument() recorded an input without a '<html' substring.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
  {
    return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
  }
510
511
  /**
   * Get dom node's outer html
   *
   * If a callback has been registered it is invoked with this parser first.
   * When the input had no '<html' wrapper only the document element is
   * serialized, otherwise the whole document.
   *
   * @return string
   */
  public function html()
  {
    if (null !== static::$callback) {
      call_user_func_array(static::$callback, array($this));
    }

    $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($content);
  }
530
531
  /**
   * Get the HTML as XML.
   *
   * The document is serialized with LIBXML_NOEMPTYTAG, the XML header is
   * stripped and the result is passed through fixHtmlOutput().
   *
   * @return string
   */
  public function xml()
  {
    $serialized = $this->document->saveXML(null, LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $withoutHeader = preg_replace('/<\?xml.*\?>/', '', $serialized);

    return $this->fixHtmlOutput(ltrim($withoutHeader));
  }
545
546
  /**
   * Get dom node's inner html
   *
   * Serializes every child of the document element, passes each through
   * fixHtmlOutput() and concatenates the results.
   *
   * @return string empty string when the document has no document element
   */
  public function innerHtml()
  {
    $root = $this->document->documentElement;

    // an empty document has no document element; there is nothing to serialize
    if ($root === null) {
      return '';
    }

    $text = '';

    foreach ($root->childNodes as $node) {
      $text .= $this->fixHtmlOutput($this->document->saveHTML($node));
    }

    return $text;
  }
561
562
  /**
   * Load HTML from string
   *
   * Replaces the current document with the one built by createDOMDocument().
   *
   * @param string $html
   *
   * @return HtmlDomParser this instance
   *
   * @throws InvalidArgumentException if argument is not string
   */
  public function loadHtml($html)
  {
    if (is_string($html) === false) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    $this->document = $this->createDOMDocument($html);

    return $this;
  }
581
582
  /**
   * Load HTML from file
   *
   * Paths starting with http:// or https:// skip the file_exists() check.
   * The content is read with file_get_contents() and handed to loadHtml().
   *
   * @param string $filePath
   *
   * @return HtmlDomParser this instance
   *
   * @throws InvalidArgumentException if argument is not string
   * @throws RuntimeException if the file does not exist or cannot be read
   */
  public function loadHtmlFile($filePath)
  {
    if (!is_string($filePath)) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    if (!preg_match("/^https?:\/\//i", $filePath) && !file_exists($filePath)) {
      throw new RuntimeException("File $filePath not found");
    }

    try {
      $html = file_get_contents($filePath);
    } catch (\Exception $e) {
      // keep the original exception as "previous" so its details are not lost
      throw new RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new RuntimeException("Could not load file $filePath");
    }

    return $this->loadHtml($html);
  }
614
615
  /**
   * Get the inner html and optionally write it to a file.
   *
   * The file is written with an exclusive lock; the result of the write is
   * not checked.
   *
   * @param string $filepath target file; an empty string skips writing
   *
   * @return string the value of innerHtml()
   */
  public function save($filepath = '')
  {
    $markup = $this->innerHtml();

    if ('' !== $filepath) {
      file_put_contents($filepath, $markup, LOCK_EX);
    }

    return $markup;
  }
631
632
  /**
   * Register the callable that html() invokes before serializing.
   *
   * The value is stored in a static property, so it applies to every instance.
   *
   * @param callable|null $functionName
   */
  public function set_callback($functionName)
  {
    static::$callback = $functionName;
  }
639
640
  /**
   * Get dom node's plain text
   *
   * @return string the document's textContent
   */
  public function text()
  {
    $plain = $this->document->textContent;

    return $plain;
  }
649
}
650