Completed
Push — master ( 68bca5...9cf0fc )
by Lars
02:26
created

HtmlDomParser::replaceToPreserveHtmlEntities()   B

Complexity

Conditions 5
Paths 4

Size

Total Lines 27
Code Lines 18

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 20
CRAP Score 5

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 27
ccs 20
cts 20
cp 1
rs 8.439
cc 5
eloc 18
nc 4
nop 1
crap 5
1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMDocument;
7
use DOMXPath;
8
use InvalidArgumentException;
9
use RuntimeException;
10
11
/**
12
 * Class HtmlDomParser
13
 *
14
 * @package voku\helper
15
 *
16
 * @property-read string outerText Get dom node's outer html (alias for "outerHtml")
17
 * @property-read string outerHtml Get dom node's outer html
18
 * @property-read string innerText Get dom node's inner html (alias for "innerHtml")
19
 * @property-read string innerHtml Get dom node's inner html
20
 * @property-read string plaintext Get dom node's plain text
21
 *
22
 * @method string outerText() Get dom node's outer html (alias for "outerHtml()")
23
 * @method string outerHtml() Get dom node's outer html
24
 * @method string innerText() Get dom node's inner html (alias for "innerHtml()")
25
 * @method HtmlDomParser load() load($html) Load HTML from string
26
 * @method HtmlDomParser load_file() load_file($html) Load HTML from file
27
 *
28
 * @method static HtmlDomParser file_get_html() file_get_html($html, $libXMLExtraOptions = null) Load HTML from file
29
 * @method static HtmlDomParser str_get_html() str_get_html($html, $libXMLExtraOptions = null) Load HTML from string
30
 */
31
class HtmlDomParser
32
{
33
  /**
34
   * @var array
35
   */
36
  protected static $functionAliases = array(
37
      'outertext' => 'html',
38
      'outerhtml' => 'html',
39
      'innertext' => 'innerHtml',
40
      'innerhtml' => 'innerHtml',
41
      'load'      => 'loadHtml',
42
      'load_file' => 'loadHtmlFile',
43
  );
44
45
  /**
46
   * @var array
47
   */
48
  protected static $domLinkReplaceHelper = array(
49
      'orig' => array('[', ']', '{', '}',),
50
      'tmp'  => array(
51
          '!!!!HTML_DOM__SQUARE_BRACKET_LEFT!!!!',
52
          '!!!!HTML_DOM__SQUARE_BRACKET_RIGHT!!!!',
53
          '!!!!HTML_DOM__BRACKET_LEFT!!!!',
54
          '!!!!HTML_DOM__BRACKET_RIGHT!!!!',
55
      ),
56
  );
57
58
  /**
59
   * @var array
60
   */
61
  protected static $domReplaceHelper = array(
62
      'orig' => array('&', '|', '+', '%'),
63
      'tmp'  => array(
64
          '!!!!HTML_DOM__AMP!!!!',
65
          '!!!!HTML_DOM__PIPE!!!!',
66
          '!!!!HTML_DOM__PLUS!!!!',
67
          '!!!!HTML_DOM__PERCENT!!!!',
68
      ),
69
  );
70
71
  /**
72
   * @var Callable
73
   */
74
  protected static $callback;
75
76
  /**
77
   * @var DOMDocument
78
   */
79
  protected $document;
80
81
  /**
82
   * @var string
83
   */
84
  protected $encoding = 'UTF-8';
85
86
  /**
87
   * @var bool
88
   */
89
  protected $isDOMDocumentCreatedWithoutHtml = false;
90
91
  /**
92
   * @var bool
93
   */
94
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
95
96
  /**
97
   * A random md5 hash, generated in the constructor from "Bootup::get_random_bytes(16)".
98
   * @var string
99
   */
100
  protected $randomHash;
101
102
  /**
103
   * Constructor
104
   *
105
   * @param string|SimpleHtmlDom|\DOMNode $element HTML code or SimpleHtmlDom, \DOMNode
106
   */
107 118
  public function __construct($element = null)
108
  {
109 118
    $this->randomHash = md5(Bootup::get_random_bytes(16));
110 118
    $this->document = new \DOMDocument('1.0', $this->getEncoding());
111
112 118
    $this->addRandBytesToDomReplaceHelpers();
113
114
    // DOMDocument settings
115 118
    $this->document->preserveWhiteSpace = true;
116 118
    $this->document->formatOutput = true;
117
118 118
    if ($element instanceof SimpleHtmlDom) {
119 51
      $element = $element->getNode();
120 51
    }
121
122 118
    if ($element instanceof \DOMNode) {
123 51
      $domNode = $this->document->importNode($element, true);
124
125 51
      if ($domNode instanceof \DOMNode) {
126 51
        $this->document->appendChild($domNode);
127 51
      }
128
129 51
      return;
130
    }
131
132 118
    if ($element !== null) {
133 70
      $this->loadHtml($element);
134 69
    }
135 117
  }
136
137
  /**
   * Append this instance's random hash to every placeholder in the two
   * static "replace helper" tables.
   *
   * Both tables are static, so the suffix is added on top of whatever earlier
   * calls already appended; the placeholders grow with each call.
   *
   * NOTE(review): putReplacedBackToPreserveHtmlEntities() searches for the
   * current static placeholders only, so text masked before a later call of
   * this method may no longer be restored - confirm against callers.
   */
  protected function addRandBytesToDomReplaceHelpers()
  {
    foreach (self::$domLinkReplaceHelper['tmp'] as &$linkHelper) {
      $linkHelper .= $this->randomHash;
    }
    // break the reference so the last array slot is not left aliased
    unset($linkHelper);

    foreach (self::$domReplaceHelper['tmp'] as &$domHelper) {
      $domHelper .= $this->randomHash;
    }
    unset($domHelper);
  }
150
151
  /**
   * Forward calls to legacy method names to their current implementation.
   *
   * The name is lower-cased and looked up in self::$functionAliases; the
   * mapped method is invoked on this instance with the original arguments.
   *
   * NOTE(review): static analysis reports this alias lookup as duplicated
   * elsewhere in the project.
   *
   * @param string $name
   * @param array  $arguments
   *
   * @return bool|mixed
   *
   * @throws BadMethodCallException if the name has no alias
   */
  public function __call($name, $arguments)
  {
    $alias = strtolower($name);

    if (!isset(self::$functionAliases[$alias])) {
      throw new BadMethodCallException('Method does not exist: ' . $alias);
    }

    return call_user_func_array(array($this, self::$functionAliases[$alias]), $arguments);
  }
167
168
  /**
   * Static entry points "str_get_html" and "file_get_html".
   *
   * Creates a new parser and loads the first argument as an HTML string or
   * as a file path; the second argument, when present, is passed on as the
   * extra libxml options.
   *
   * @param string $name
   * @param array  $arguments
   *
   * @return HtmlDomParser
   *
   * @throws BadMethodCallException for any other method name
   */
  public static function __callStatic($name, $arguments)
  {
    $source = isset($arguments[0]) ? $arguments[0] : null;
    $extraOptions = isset($arguments[1]) ? $arguments[1] : null;

    switch ($name) {
      case 'str_get_html':
        $parser = new self();

        return $parser->loadHtml($source, $extraOptions);

      case 'file_get_html':
        $parser = new self();

        return $parser->loadHtmlFile($source, $extraOptions);
    }

    throw new BadMethodCallException('Method does not exist');
  }
200
201
  /**
   * Read-only virtual properties (matched case-insensitively).
   *
   * "outerhtml"/"outertext" map to html(), "innerhtml"/"innertext" to
   * innerHtml() and "text"/"plaintext" to text().
   *
   * @param string $name
   *
   * @return string|null null when the name is not one of the properties above
   */
  public function __get($name)
  {
    $property = strtolower($name);

    if ($property === 'outerhtml' || $property === 'outertext') {
      return $this->html();
    }

    if ($property === 'innerhtml' || $property === 'innertext') {
      return $this->innerHtml();
    }

    if ($property === 'text' || $property === 'plaintext') {
      return $this->text();
    }

    return null;
  }
224
225
  /**
   * Calling the parser like a function is a shortcut for find().
   *
   * @param string $selector
   * @param int    $idx
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|null
   */
  public function __invoke($selector, $idx = null)
  {
    $result = $this->find($selector, $idx);

    return $result;
  }
235
236
  /**
   * String conversion yields the same value as html().
   *
   * @return string
   */
  public function __toString()
  {
    $outerHtml = $this->html();

    return $outerHtml;
  }
243
244
  /**
   * No-op kept for API compatibility; always reports success.
   *
   * @return bool always true
   */
  public function clear()
  {
    $done = true;

    return $done;
  }
253
254
  /**
   * Mask characters in the markup with temporary placeholders.
   *
   * Every http(s) link found in the input gets its "[", "]", "{" and "}"
   * swapped for the link placeholders; afterwards "&", "|", "+" and "%" are
   * swapped for their placeholders throughout the whole string.
   * putReplacedBackToPreserveHtmlEntities() performs the reverse mapping.
   *
   * @param string $html
   *
   * @return string
   */
  protected function replaceToPreserveHtmlEntities($html)
  {
    // default: only the plain character placeholders are applied
    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    preg_match_all("/(\bhttps?:\/\/[^\s()<>]+(?:\([\w\d]+\)|[^[:punct:]\s]|\/|\}|\]))/i", $html, $matches);

    if (!empty($matches[1])) {
      $links = $matches[1];

      $maskedLinks = array();
      foreach ($links as $position => $link) {
        $maskedLinks[$position] = str_replace(
            self::$domLinkReplaceHelper['orig'],
            self::$domLinkReplaceHelper['tmp'],
            $link
        );
      }

      // str_replace() works through the pairs in order, so the links are
      // rewritten before the single characters are masked
      $search = array_merge($links, $search);
      $replace = array_merge($maskedLinks, $replace);
    }

    return str_replace($search, $replace, $html);
  }
286
287
  /**
   * Turn the temporary placeholders back into their original characters.
   *
   * Uses the placeholder values currently stored in the two static helper
   * tables (link placeholders first, then the single-character ones).
   *
   * @param string $html
   *
   * @return string
   */
  public static function putReplacedBackToPreserveHtmlEntities($html)
  {
    $placeholders = array_merge(
        self::$domLinkReplaceHelper['tmp'],
        self::$domReplaceHelper['tmp']
    );

    $originals = array_merge(
        self::$domLinkReplaceHelper['orig'],
        self::$domReplaceHelper['orig']
    );

    return str_replace($placeholders, $originals, $html);
  }
306
307
  /**
   * Create the DOMDocument from an HTML string.
   *
   * The markup is first parsed as XML via simplexml_load_string(); when that
   * fails or reports libxml errors, it is loaded with DOMDocument::loadHTML()
   * after the placeholder masking of replaceToPreserveHtmlEntities().
   *
   * Side effects: switches on the "created without html" / "created without
   * html wrapper" flags (they are never reset here) and temporarily changes
   * the libxml error and entity-loader settings, restoring them before
   * returning.
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions extra libxml flags OR-ed into the XML parse options
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html, $libXMLExtraOptions = null)
  {
    if (strpos($html, '<') === false) {
      $this->isDOMDocumentCreatedWithoutHtml = true;
    }

    if (strpos($html, '<html') === false) {
      $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
    }

    // set error level
    $internalErrors = libxml_use_internal_errors(true);
    $disableEntityLoader = libxml_disable_entity_loader(true);
    libxml_clear_errors();

    $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;

    // these constants depend on the libxml version PHP was built against
    if (defined('LIBXML_COMPACT')) {
      $options |= LIBXML_COMPACT;
    }

    if (defined('LIBXML_HTML_NOIMPLIED')) {
      $options |= LIBXML_HTML_NOIMPLIED;
    }

    if (defined('LIBXML_HTML_NODEFDTD')) {
      $options |= LIBXML_HTML_NODEFDTD;
    }

    if ($libXMLExtraOptions !== null) {
      $options |= $libXMLExtraOptions;
    }

    $sxe = simplexml_load_string($html, 'SimpleXMLElement', $options);
    if ($sxe !== false && count(libxml_get_errors()) === 0) {
      $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {

      // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
      $html = trim($html);
      $xmlHackUsed = false;
      // stripos(haystack, needle): only prepend the declaration when the
      // markup does not already start with one
      if (stripos($html, '<?xml') !== 0) {
        $xmlHackUsed = true;
        $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
      }

      $html = $this->replaceToPreserveHtmlEntities($html);

      $this->document->loadHTML($html);

      // remove the "xml-encoding" hack: the injected declaration was put in
      // front of the markup, so it is the first processing instruction.
      // Stop right after removing it - childNodes is a live list and any
      // other processing instruction belongs to the caller's markup.
      if ($xmlHackUsed === true) {
        foreach ($this->document->childNodes as $child) {
          if ($child->nodeType == XML_PI_NODE) {
            $this->document->removeChild($child);
            break;
          }
        }
      }

      libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($internalErrors);
    libxml_disable_entity_loader($disableEntityLoader);

    return $this->document;
  }
386
387
  /**
   * Look up a single element by id; shortcut for find('#<id>', 0).
   *
   * @param string $id
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementById($id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
398
399
  /**
   * Wrap the first element with the given tag name.
   *
   * @param string $name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank the blank node when no element matches
   */
  public function getElementByTagName($name)
  {
    $first = $this->document->getElementsByTagName($name)->item(0);

    if ($first === null) {
      return new SimpleHtmlDomNodeBlank();
    }

    return new SimpleHtmlDom($first);
  }
416
417
  /**
   * Look up elements by id; shortcut for find('#<id>', $idx).
   *
   * @param string   $id
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementsById($id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
429
430
  /**
   * Collect the elements with the given tag name.
   *
   * Without an index the whole collection is returned. With an index the
   * element at that position is returned; a negative index is counted from
   * the end of the collection.
   *
   * NOTE(review): static analysis reports this method as duplicated
   * elsewhere in the project.
   *
   * @param string   $name
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDom|SimpleHtmlDomNodeBlank the blank node when the index does not exist
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $collection = new SimpleHtmlDomNode();

    foreach ($this->document->getElementsByTagName($name) as $domNode) {
      $collection[] = new SimpleHtmlDom($domNode);
    }

    if (null === $idx) {
      return $collection;
    }

    $position = ($idx < 0) ? count($collection) + $idx : $idx;

    if (!isset($collection[$position])) {
      return new SimpleHtmlDomNodeBlank();
    }

    return $collection[$position];
  }
462
463
  /**
   * Find list of nodes with a CSS selector.
   *
   * The selector is converted to XPath via SelectorConverter::toXPath() and
   * evaluated against the current document. Without an index the whole
   * collection is returned. With an index the element at that position is
   * returned; a negative index is counted from the end of the collection.
   *
   * @param string   $selector
   * @param int|null $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDom[]|SimpleHtmlDom|SimpleHtmlDomNodeBlank
   *         the collection when $idx is null, otherwise the element at $idx
   *         or the blank node when that index does not exist
   */
  public function find($selector, $idx = null)
  {
    $xPathQuery = SelectorConverter::toXPath($selector);

    $xPath = new DOMXPath($this->document);
    $nodesList = $xPath->query($xPathQuery);
    $elements = new SimpleHtmlDomNode();

    // DOMXPath::query() yields false for an expression it cannot evaluate;
    // treat that as "no matches" instead of iterating over a boolean
    if ($nodesList !== false) {
      foreach ($nodesList as $node) {
        $elements[] = new SimpleHtmlDom($node);
      }
    }

    if (null === $idx) {
      return $elements;
    }

    if ($idx < 0) {
      $idx = count($elements) + $idx;
    }

    if (isset($elements[$idx])) {
      return $elements[$idx];
    }

    return new SimpleHtmlDomNodeBlank();
  }
497
498
  /**
   * Clean up serialized output before it is handed back to the caller.
   *
   * Depending on the flags recorded while loading, line breaks and wrapper
   * tags are stripped; the result is then passed through
   * UTF8::html_entity_decode(), trim() and UTF8::rawurldecode(), and finally
   * the temporary placeholders are turned back into their characters.
   *
   * @param string $content
   *
   * @return string
   */
  protected function fixHtmlOutput($content)
  {
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      $wrapperFragments = array(
          "\n",
          "\r\n",
          "\r",
          '<simpleHtmlDomP>',
          '</simpleHtmlDomP>',
          '<body>',
          '</body>',
          '<html>',
          '</html>',
      );

      $content = str_replace($wrapperFragments, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $plainTextFragments = array(
          '<p>',
          '</p>',
          '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">'
      );

      $content = str_replace($plainTextFragments, '', $content);
    }

    $decoded = UTF8::rawurldecode(trim(UTF8::html_entity_decode($content)));

    return self::putReplacedBackToPreserveHtmlEntities($decoded);
  }
545
546
  /**
   * Give access to the underlying DOM document.
   *
   * @return DOMDocument
   */
  public function getDocument()
  {
    $document = $this->document;

    return $document;
  }
553
554
  /**
   * The character encoding configured for this parser.
   *
   * @return string
   */
  private function getEncoding()
  {
    $encoding = $this->encoding;

    return $encoding;
  }
563
564
  /**
   * Whether markup without any "<" character has been loaded.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtml;

    return $flag;
  }
571
572
  /**
   * Whether markup without an "<html" tag has been loaded.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $flag;
  }
579
580
  /**
   * Get dom node's outer html
   *
   * A registered callback is invoked with this parser first. When the
   * markup was loaded without an "<html" wrapper only the document element
   * is serialized, otherwise the whole document; the result is passed
   * through fixHtmlOutput().
   *
   * @return string
   */
  public function html()
  {
    if (static::$callback !== null) {
      call_user_func_array(static::$callback, array($this));
    }

    $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($content);
  }
599
600
  /**
   * Get the HTML as XML.
   *
   * The document is serialized with LIBXML_NOEMPTYTAG, the XML header is
   * cut out and the rest is passed through fixHtmlOutput().
   *
   * @return string
   */
  public function xml()
  {
    $serialized = $this->document->saveXML(null, LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $withoutHeader = preg_replace('/<\?xml.*\?>/', '', $serialized);

    return $this->fixHtmlOutput(ltrim($withoutHeader));
  }
614
615
  /**
   * Get dom node's inner html
   *
   * Serializes every child of the document element separately, passes each
   * piece through fixHtmlOutput() and concatenates the results.
   *
   * @return string empty string when the document has no document element
   */
  public function innerHtml()
  {
    $root = $this->document->documentElement;

    // an empty document (e.g. a parser created without input) has no root
    if ($root === null) {
      return '';
    }

    $text = '';

    foreach ($root->childNodes as $node) {
      $text .= $this->fixHtmlOutput($this->document->saveHTML($node));
    }

    return $text;
  }
630
631
  /**
   * Load HTML from string
   *
   * Replaces the current document with the one created from the markup.
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions
   *
   * @return HtmlDomParser this instance, for chaining
   *
   * @throws InvalidArgumentException if argument is not string
   */
  public function loadHtml($html, $libXMLExtraOptions = null)
  {
    if (is_string($html) === false) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    $document = $this->createDOMDocument($html, $libXMLExtraOptions);
    $this->document = $document;

    return $this;
  }
651
652
  /**
   * Load HTML from file
   *
   * Paths starting with "http://" or "https://" skip the file_exists()
   * check; the content is read via UTF8::file_get_contents() and then
   * handed to loadHtml().
   *
   * @param string   $filePath
   * @param int|null $libXMLExtraOptions
   *
   * @return HtmlDomParser this instance, for chaining
   *
   * @throws InvalidArgumentException if the path is not a string
   * @throws RuntimeException if the file does not exist or cannot be read
   */
  public function loadHtmlFile($filePath, $libXMLExtraOptions = null)
  {
    if (!is_string($filePath)) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    if (!preg_match("/^https?:\/\//i", $filePath) && !file_exists($filePath)) {
      throw new RuntimeException("File $filePath not found");
    }

    try {
      $html = UTF8::file_get_contents($filePath);
    } catch (\Exception $e) {
      // keep the original failure attached so the cause stays visible
      throw new RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html, $libXMLExtraOptions);

    return $this;
  }
685
686
  /**
   * Save dom as string
   *
   * Returns the value of innerHtml(); when a non-empty path is given the
   * same string is also written to that file under an exclusive lock. The
   * result of the write is not checked.
   *
   * @param string $filepath
   *
   * @return string
   */
  public function save($filepath = '')
  {
    $markup = $this->innerHtml();

    if ($filepath === '') {
      return $markup;
    }

    file_put_contents($filepath, $markup, LOCK_EX);

    return $markup;
  }
702
703
  /**
   * Register the callback that html() invokes before serializing.
   *
   * The callback is stored in a static property, so it is shared by all
   * parser instances.
   *
   * @param callable|null $functionName
   */
  public function set_callback($functionName)
  {
    static::$callback = $functionName;
  }
710
711
  /**
   * Get dom node's plain text
   *
   * Takes the document's textContent and passes it through fixHtmlOutput().
   *
   * @return string
   */
  public function text()
  {
    $plain = $this->document->textContent;

    return $this->fixHtmlOutput($plain);
  }
720
}
721