Completed
Push — master ( 6b0cbf...ef4cd6 )
by Lars
02:56
created

HtmlDomParser::innerHtml()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 10
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 2

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 10
ccs 5
cts 5
cp 1
rs 9.4285
cc 2
eloc 5
nc 2
nop 0
crap 2
1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMDocument;
7
use DOMXPath;
8
use InvalidArgumentException;
9
use RuntimeException;
10
11
/**
 * Class HtmlDomParser
 *
 * Wraps a \DOMDocument and exposes a "simple_html_dom"-like API on top of it:
 * CSS-selector lookups via find(), plus magic property / method aliases
 * (see __get(), __call() and __callStatic()).
 *
 * @package voku\helper
 *
 * @property-read string outerText Get dom node's outer html (alias for "outerHtml")
 * @property-read string outerHtml Get dom node's outer html
 * @property-read string innerText Get dom node's inner html (alias for "innerHtml")
 * @property-read string innerHtml Get dom node's inner html
 * @property-read string plaintext Get dom node's plain text
 * @property-read string text      Get dom node's plain text (alias for "plaintext")
 *
 * @method string outerText() Get dom node's outer html (alias for "outerHtml()")
 * @method string outerHtml() Get dom node's outer html
 * @method string innerText() Get dom node's inner html (alias for "innerHtml()")
 * @method HtmlDomParser load() load($html) Load HTML from string
 * @method HtmlDomParser load_file() load_file($html) Load HTML from file
 *
 * @method static HtmlDomParser file_get_html() file_get_html($html, $libXMLExtraOptions = null) Load HTML from file
 * @method static HtmlDomParser str_get_html() str_get_html($html, $libXMLExtraOptions = null) Load HTML from string
 */
class HtmlDomParser
{
  /**
   * Lower-cased magic method name => real method name, used by __call().
   *
   * @var array
   */
  protected static $functionAliases = array(
      'outertext' => 'html',
      'outerhtml' => 'html',
      'innertext' => 'innerHtml',
      'innerhtml' => 'innerHtml',
      'load'      => 'loadHtml',
      'load_file' => 'loadHtmlFile',
  );

  /**
   * Characters that are swapped for placeholders inside detected links
   * before parsing ('orig' and 'tmp' are index-aligned).
   *
   * @var array
   */
  private static $domLinkReplaceHelper = array(
      'orig' => array('[', ']', '{', '}',),
      'tmp'  => array(
          '!!!!HTML_DOM__SQUARE_BRACKET_LEFT!!!!',
          '!!!!HTML_DOM__SQUARE_BRACKET_RIGHT!!!!',
          '!!!!HTML_DOM__BRACKET_LEFT!!!!',
          '!!!!HTML_DOM__BRACKET_RIGHT!!!!',
      ),
  );

  /**
   * Characters that are swapped for placeholders in the whole input
   * before parsing ('orig' and 'tmp' are index-aligned).
   *
   * @var array
   */
  protected static $domReplaceHelper = array(
      'orig' => array('&', '|', '+', '%'),
      'tmp'  => array(
          '!!!!HTML_DOM__AMP!!!!',
          '!!!!HTML_DOM__PIPE!!!!',
          '!!!!HTML_DOM__PLUS!!!!',
          '!!!!HTML_DOM__PERCENT!!!!',
      ),
  );

  /**
   * Optional callback that html() invokes with this parser before rendering.
   *
   * @var Callable
   */
  protected static $callback;

  /**
   * @var DOMDocument
   */
  protected $document;

  /**
   * @var string
   */
  protected $encoding = 'UTF-8';

  /**
   * True when the loaded input contained no "<" at all.
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtml = false;

  /**
   * True when the loaded input contained no "<html".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;

  /**
   * Constructor
   *
   * A SimpleHtmlDom / \DOMNode is imported (deep copy) into a fresh document;
   * any other non-null value is handed to loadHtml().
   *
   * @param string|SimpleHtmlDom|\DOMNode $element HTML code or SimpleHtmlDom, \DOMNode
   *
   * @throws \InvalidArgumentException if $element is neither null, a node nor a string
   */
  public function __construct($element = null)
  {
    $this->document = new \DOMDocument('1.0', $this->getEncoding());

    // DOMDocument settings
    $this->document->preserveWhiteSpace = true;
    $this->document->formatOutput = true;

    if ($element instanceof SimpleHtmlDom) {
      $element = $element->getNode();
    }

    if ($element instanceof \DOMNode) {
      $domNode = $this->document->importNode($element, true);

      if ($domNode instanceof \DOMNode) {
        $this->document->appendChild($domNode);
      }

      return;
    }

    if ($element !== null) {
      $this->loadHtml($element);
    }
  }

  /**
   * Dispatches aliased instance methods (case-insensitive), see $functionAliases.
   *
   * @param string $name
   * @param array  $arguments
   *
   * @return bool|mixed
   *
   * @throws \BadMethodCallException if no alias exists for $name
   */
  public function __call($name, $arguments)
  {
    $name = strtolower($name);

    if (isset(self::$functionAliases[$name])) {
      return call_user_func_array(array($this, self::$functionAliases[$name]), $arguments);
    }

    throw new \BadMethodCallException('Method does not exist: ' . $name);
  }

  /**
   * Static factories "str_get_html" and "file_get_html".
   *
   * @param string $name
   * @param array  $arguments [0] => html or file path, [1] => extra libxml options
   *
   * @return HtmlDomParser
   *
   * @throws \BadMethodCallException for any other method name
   */
  public static function __callStatic($name, $arguments)
  {
    $arguments0 = isset($arguments[0]) ? $arguments[0] : null;
    $arguments1 = isset($arguments[1]) ? $arguments[1] : null;

    if ($name === 'str_get_html') {
      $parser = new self();

      return $parser->loadHtml($arguments0, $arguments1);
    }

    if ($name === 'file_get_html') {
      $parser = new self();

      return $parser->loadHtmlFile($arguments0, $arguments1);
    }

    throw new \BadMethodCallException('Method does not exist');
  }

  /**
   * Magic read access for the rendered document (property name is case-insensitive).
   *
   * @param string $name
   *
   * @return string|null null for unknown property names
   */
  public function __get($name)
  {
    $name = strtolower($name);

    switch ($name) {
      case 'outerhtml':
      case 'outertext':
        return $this->html();
      case 'innerhtml':
      case 'innertext':
        return $this->innerHtml();
      case 'text':
      case 'plaintext':
        return $this->text();
    }

    return null;
  }

  /**
   * Shortcut for find().
   *
   * @param string $selector
   * @param int    $idx
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|null
   */
  public function __invoke($selector, $idx = null)
  {
    return $this->find($selector, $idx);
  }

  /**
   * @return string
   */
  public function __toString()
  {
    return $this->html();
  }

  /**
   * does nothing (only for api-compatibility-reasons)
   *
   * @return bool
   */
  public function clear()
  {
    return true;
  }

  /**
   * Replaces special characters by placeholders before the html is parsed.
   *
   * Links (http / https) are detected first and get the characters from
   * $domLinkReplaceHelper replaced, then the characters from
   * $domReplaceHelper are replaced in the whole string.
   * putReplacedBackToPreserveHtmlEntities() is the inverse operation.
   *
   * @param string $html
   *
   * @return string
   */
  private function replaceToPreserveHtmlEntities($html)
  {
    preg_match_all("/(\bhttps?:\/\/[^\s()<>]+(?:\([\w\d]+\)|[^[:punct:]\s]|\/|\}|\]))/i", $html, $linksOld);

    $linksNew = array();
    if (!empty($linksOld[1])) {
      $linksOld = $linksOld[1];
      foreach ($linksOld as $linkKey => $linkOld) {
        $linksNew[$linkKey] = str_replace(
            self::$domLinkReplaceHelper['orig'],
            self::$domLinkReplaceHelper['tmp'],
            $linkOld
        );
      }
    }

    $linksNewCount = count($linksNew);
    if ($linksNewCount > 0 && count($linksOld) === $linksNewCount) {
      $search = array_merge($linksOld, self::$domReplaceHelper['orig']);
      $replace = array_merge($linksNew, self::$domReplaceHelper['tmp']);
    } else {
      $search = self::$domReplaceHelper['orig'];
      $replace = self::$domReplaceHelper['tmp'];
    }

    return str_replace($search, $replace, $html);
  }

  /**
   * Turns the placeholders back into the original characters and
   * removes "&#13;" from the string.
   *
   * @param string $html
   *
   * @return string
   */
  public static function putReplacedBackToPreserveHtmlEntities($html)
  {
    return str_replace(
        array_merge(
            self::$domLinkReplaceHelper['tmp'],
            self::$domReplaceHelper['tmp'],
            array('&#13;')
        ),
        array_merge(
            self::$domLinkReplaceHelper['orig'],
            self::$domReplaceHelper['orig'],
            array('')
        ),
        $html
    );
  }

  /**
   * create DOMDocument from HTML
   *
   * The input is first parsed via simplexml_load_string(); when that fails
   * or reports libxml errors, DOMDocument::loadHTML() is used instead.
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html, $libXMLExtraOptions = null)
  {
    if (strpos($html, '<') === false) {
      $this->isDOMDocumentCreatedWithoutHtml = true;
    }

    if (strpos($html, '<html') === false) {
      $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
    }

    // set error level
    $internalErrors = libxml_use_internal_errors(true);
    $disableEntityLoader = libxml_disable_entity_loader(true);
    libxml_clear_errors();

    $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;

    if (defined('LIBXML_COMPACT')) {
      $options |= LIBXML_COMPACT;
    }

    if (defined('LIBXML_HTML_NOIMPLIED')) {
      $options |= LIBXML_HTML_NOIMPLIED;
    }

    if (defined('LIBXML_HTML_NODEFDTD')) {
      $options |= LIBXML_HTML_NODEFDTD;
    }

    if ($libXMLExtraOptions !== null) {
      $options |= $libXMLExtraOptions;
    }

    $sxe = simplexml_load_string($html, 'SimpleXMLElement', $options);
    if ($sxe !== false && count(libxml_get_errors()) === 0) {
      $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {

      // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
      $html = trim($html);
      $xmlHackUsed = false;
      // NOTE(review): stripos() receives the literal as haystack and $html as
      // needle, so this check only fails when $html itself is a prefix of the
      // literal; for ordinary documents the declaration is always prepended.
      // Swapping the arguments would change the output for inputs that already
      // start with an xml declaration - confirm the intent before touching it.
      if (stripos('<?xml', $html) !== 0) {
        $xmlHackUsed = true;
        $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
      }

      $html = $this->replaceToPreserveHtmlEntities($html);

      $this->document->loadHTML($html);

      // remove the "xml-encoding" hack
      if ($xmlHackUsed === true) {
        // "childNodes" is a live list: removing a node while iterating over it
        // shifts the remaining nodes, so we only locate the injected node here
        // (it was prepended, hence it is the first processing instruction) ...
        $hackNode = null;
        foreach ($this->document->childNodes as $child) {
          if ($child->nodeType == XML_PI_NODE) {
            $hackNode = $child;
            break;
          }
        }

        // ... and remove it once the iteration is finished.
        if ($hackNode !== null) {
          $this->document->removeChild($hackNode);
        }
      }

      libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($internalErrors);
    libxml_disable_entity_loader($disableEntityLoader);

    return $this->document;
  }

  /**
   * Return SimpleHtmlDom by id.
   *
   * @param string $id
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementById($id)
  {
    return $this->find("#$id", 0);
  }

  /**
   * Return SimpleHtmlDom by tag name.
   *
   * @param string $name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank the first matching element, or a blank node
   */
  public function getElementByTagName($name)
  {
    $node = $this->document->getElementsByTagName($name)->item(0);

    if ($node === null) {
      return new SimpleHtmlDomNodeBlank();
    }

    return new SimpleHtmlDom($node);
  }

  /**
   * Returns Elements by id
   *
   * @param string   $id
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementsById($id, $idx = null)
  {
    return $this->find("#$id", $idx);
  }

  /**
   * Returns Elements by tag name
   *
   * @param string   $name
   * @param null|int $idx null => all elements, negative values count from the end
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $nodesList = $this->document->getElementsByTagName($name);

    $elements = new SimpleHtmlDomNode();

    foreach ($nodesList as $node) {
      $elements[] = new SimpleHtmlDom($node);
    }

    return $this->selectByIndex($elements, $idx);
  }

  /**
   * Find list of nodes with a CSS selector.
   *
   * @param string $selector
   * @param int    $idx null => all elements, negative values count from the end
   *
   * @return SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
   */
  public function find($selector, $idx = null)
  {
    $xPathQuery = SelectorConverter::toXPath($selector);

    $xPath = new \DOMXPath($this->document);
    $nodesList = $xPath->query($xPathQuery);
    $elements = new SimpleHtmlDomNode();

    foreach ($nodesList as $node) {
      $elements[] = new SimpleHtmlDom($node);
    }

    return $this->selectByIndex($elements, $idx);
  }

  /**
   * Shared result selection of find() and getElementsByTagName().
   *
   * @param SimpleHtmlDomNode $elements collected elements
   * @param null|int          $idx      null => return the whole collection,
   *                                    negative values are counted from the end
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank a blank node if nothing exists at $idx
   */
  private function selectByIndex($elements, $idx)
  {
    if (null === $idx) {
      return $elements;
    }

    if ($idx < 0) {
      $idx = count($elements) + $idx;
    }

    if (isset($elements[$idx])) {
      return $elements[$idx];
    }

    return new SimpleHtmlDomNodeBlank();
  }

  /**
   * Cleans up serialized output: strips the wrapper markup that was not part
   * of the input, decodes entities / url-encoding and restores the
   * placeholders from replaceToPreserveHtmlEntities().
   *
   * @param string $content
   *
   * @return string
   */
  protected function fixHtmlOutput($content)
  {
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      $content = str_replace(
          array(
              "\n",
              "\r\n",
              "\r",
              '<simpleHtmlDomP>',
              '</simpleHtmlDomP>',
              '<body>',
              '</body>',
              '<html>',
              '</html>',
          ),
          '',
          $content
      );
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $content = str_replace(
          array(
              '<p>',
              '</p>',
              '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">'
          ),
          '',
          $content);
    }

    $content = UTF8::html_entity_decode($content);
    $content = trim($content);
    $content = UTF8::urldecode($content);

    $content = self::putReplacedBackToPreserveHtmlEntities($content);

    return $content;
  }

  /**
   * @return DOMDocument
   */
  public function getDocument()
  {
    return $this->document;
  }

  /**
   * Get the encoding to use
   *
   * @return string
   */
  private function getEncoding()
  {
    return $this->encoding;
  }

  /**
   * @return bool true if the loaded input contained no "<"
   */
  public function getIsDOMDocumentCreatedWithoutHtml()
  {
    return $this->isDOMDocumentCreatedWithoutHtml;
  }

  /**
   * @return bool true if the loaded input contained no "<html"
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
  {
    return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
  }

  /**
   * Get dom node's outer html
   *
   * Calls the registered callback (see set_callback()) with this parser first.
   *
   * @return string
   */
  public function html()
  {
    if ($this::$callback !== null) {
      call_user_func_array($this::$callback, array($this));
    }

    if ($this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
      // the input had no "<html" wrapper, so only the root element is serialized
      $content = $this->document->saveHTML($this->document->documentElement);
    } else {
      $content = $this->document->saveHTML();
    }

    return $this->fixHtmlOutput($content);
  }

  /**
   * Get the HTML as XML.
   *
   * @return string
   */
  public function xml()
  {
    $xml = $this->document->saveXML(null, LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $xml = ltrim(preg_replace('/<\?xml.*\?>/', '', $xml));

    return $this->fixHtmlOutput($xml);
  }

  /**
   * Get dom node's inner html
   *
   * Every child of the document element is serialized and cleaned separately,
   * the results are concatenated.
   *
   * @return string
   */
  public function innerHtml()
  {
    $text = '';

    foreach ($this->document->documentElement->childNodes as $node) {
      $text .= $this->fixHtmlOutput($this->document->saveHTML($node));
    }

    return $text;
  }

  /**
   * Load HTML from string
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions
   *
   * @return HtmlDomParser
   *
   * @throws \InvalidArgumentException if argument is not string
   */
  public function loadHtml($html, $libXMLExtraOptions = null)
  {
    if (!is_string($html)) {
      throw new \InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    $this->document = $this->createDOMDocument($html, $libXMLExtraOptions);

    return $this;
  }

  /**
   * Load HTML from file
   *
   * @param string   $filePath local path or http(s) url
   * @param int|null $libXMLExtraOptions
   *
   * @return HtmlDomParser
   *
   * @throws \InvalidArgumentException if $filePath is not a string
   * @throws \RuntimeException         if the file does not exist or could not be read
   */
  public function loadHtmlFile($filePath, $libXMLExtraOptions = null)
  {
    if (!is_string($filePath)) {
      throw new \InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    // only local paths can be checked up front, urls are checked by the read below
    if (!preg_match("/^https?:\/\//i", $filePath) && !file_exists($filePath)) {
      throw new \RuntimeException("File $filePath not found");
    }

    try {
      $html = UTF8::file_get_contents($filePath);
    } catch (\Exception $e) {
      // keep the original failure reachable via getPrevious()
      throw new \RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new \RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html, $libXMLExtraOptions);

    return $this;
  }

  /**
   * Save dom as string
   *
   * @param string $filepath if not empty, the inner html is also written to this file
   *
   * @return string the inner html
   */
  public function save($filepath = '')
  {
    $string = $this->innerHtml();
    if ($filepath !== '') {
      file_put_contents($filepath, $string, LOCK_EX);
    }

    return $string;
  }

  /**
   * Registers the callback that html() invokes (with this parser as argument)
   * before the document is rendered. The callback is stored statically.
   *
   * @param callable $functionName
   */
  public function set_callback($functionName)
  {
    $this::$callback = $functionName;
  }

  /**
   * Get dom node's plain text
   *
   * @return string
   */
  public function text()
  {
    return $this->fixHtmlOutput($this->document->textContent);
  }
}
700