Completed
Push — master ( 1483ee...c351f4 )
by Lars
02:59
created

HtmlDomParser::set_callback()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 4
ccs 0
cts 3
cp 0
rs 10
cc 1
eloc 2
nc 1
nop 1
crap 2
1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMDocument;
7
use DOMXPath;
8
use InvalidArgumentException;
9
use RuntimeException;
10
11
/**
12
 * Class HtmlDomParser
13
 *
14
 * @package voku\helper
15
 *
16
 * @property-read string outerText Get dom node's outer html (alias for "outerHtml")
17
 * @property-read string outerHtml Get dom node's outer html
18
 * @property-read string innerText Get dom node's inner html (alias for "innerHtml")
19
 * @property-read string innerHtml Get dom node's inner html
20
 * @property-read string plaintext Get dom node's plain text
21
 *
22
 * @method string outerText() Get dom node's outer html (alias for "outerHtml()")
23
 * @method string outerHtml() Get dom node's outer html
24
 * @method string innerText() Get dom node's inner html (alias for "innerHtml()")
25
 * @method HtmlDomParser load() load($html) Load HTML from string
26
 * @method HtmlDomParser load_file() load_file($html) Load HTML from file
27
 *
28
 * @method static HtmlDomParser file_get_html() file_get_html($html, $libXMLExtraOptions = null) Load HTML from file
29
 * @method static HtmlDomParser str_get_html() str_get_html($html, $libXMLExtraOptions = null) Load HTML from string
30
 */
31
class HtmlDomParser
32
{
33
  /**
   * Lower-cased legacy method names accepted by __call(), mapped to the
   * method of this class that actually implements them.
   *
   * @var array
   */
  protected static $functionAliases = array(
      'outertext' => 'html',
      'outerhtml' => 'html',
      'innertext' => 'innerHtml',
      'innerhtml' => 'innerHtml',
      'load'      => 'loadHtml',
      'load_file' => 'loadHtmlFile',
  );

  /**
   * Characters that are swapped for placeholders inside detected links
   * ("orig") and the placeholder used for each of them ("tmp"). Both lists
   * are index-aligned.
   *
   * @var array
   */
  private static $domLinkReplaceHelper = array(
      'orig' => array('[', ']', '{', '}',),
      'tmp'  => array(
          '!!!!HTML_DOM__SQUARE_BRACKET_LEFT!!!!',
          '!!!!HTML_DOM__SQUARE_BRACKET_RIGHT!!!!',
          '!!!!HTML_DOM__BRACKET_LEFT!!!!',
          '!!!!HTML_DOM__BRACKET_RIGHT!!!!',
      ),
  );

  /**
   * Characters that are swapped for placeholders in the whole input before
   * it is parsed ("orig") and the placeholder used for each of them ("tmp").
   * Both lists are index-aligned.
   *
   * @var array
   */
  protected static $domReplaceHelper = array(
      'orig' => array('&', '|', '+', '%'),
      'tmp'  => array(
          '!!!!HTML_DOM__AMP!!!!',
          '!!!!HTML_DOM__PIPE!!!!',
          '!!!!HTML_DOM__PLUS!!!!',
          '!!!!HTML_DOM__PERCENT!!!!',
      ),
  );

  /**
   * Optional callback invoked by html() with the parser instance before the
   * markup is serialized; null until set_callback() is called.
   *
   * @var callable|null
   */
  protected static $callback;

  /**
   * The document all queries and serializations operate on.
   *
   * @var DOMDocument
   */
  protected $document;

  /**
   * Encoding used when creating the document and for the encoding hack.
   *
   * @var string
   */
  protected $encoding = 'UTF-8';

  /**
   * True when the loaded input did not contain a "<" character at all.
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtml = false;

  /**
   * True when the loaded input did not contain the string "<html".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
95
96
  /**
   * Create a parser, optionally pre-filled with content.
   *
   * A SimpleHtmlDom is unwrapped to its node first. A \DOMNode is imported
   * (deep copy) into a fresh document; any other non-null value is passed
   * to loadHtml().
   *
   * @param string|SimpleHtmlDom|\DOMNode|null $element HTML code or SimpleHtmlDom, \DOMNode
   *
   * @throws InvalidArgumentException raised by loadHtml() when the value is not a string
   */
  public function __construct($element = null)
  {
    $freshDocument = new \DOMDocument('1.0', $this->getEncoding());

    // DOMDocument settings
    $freshDocument->preserveWhiteSpace = true;
    $freshDocument->formatOutput = true;

    $this->document = $freshDocument;

    $source = $element instanceof SimpleHtmlDom ? $element->getNode() : $element;

    if ($source instanceof \DOMNode) {
      $imported = $this->document->importNode($source, true);

      if ($imported instanceof \DOMNode) {
        $this->document->appendChild($imported);
      }
    } elseif ($source !== null) {
      $this->loadHtml($source);
    }
  }
127
128
  /**
   * Dispatch calls to legacy method names listed in $functionAliases.
   *
   * The name is matched case-insensitively (it is lower-cased first).
   *
   * @param string $name      called method name
   * @param array  $arguments arguments forwarded to the aliased method
   *
   * @return bool|mixed whatever the aliased method returns
   *
   * @throws BadMethodCallException when no alias exists for the name
   */
  public function __call($name, $arguments)
  {
    $alias = strtolower($name);

    if (!isset(self::$functionAliases[$alias])) {
      throw new BadMethodCallException('Method does not exist: ' . $alias);
    }

    $target = array($this, self::$functionAliases[$alias]);

    return call_user_func_array($target, $arguments);
  }
144
145
  /**
   * Static entry points "str_get_html" and "file_get_html".
   *
   * Both create a new parser and forward the first two arguments (null when
   * missing) to loadHtml() or loadHtmlFile() respectively.
   *
   * @param string $name      called static method name
   * @param array  $arguments call arguments
   *
   * @return HtmlDomParser
   *
   * @throws BadMethodCallException for any other method name
   */
  public static function __callStatic($name, $arguments)
  {
    $first = isset($arguments[0]) ? $arguments[0] : null;
    $second = isset($arguments[1]) ? $arguments[1] : null;

    switch ($name) {
      case 'str_get_html':
        $parser = new self();

        return $parser->loadHtml($first, $second);

      case 'file_get_html':
        $parser = new self();

        return $parser->loadHtmlFile($first, $second);
    }

    throw new BadMethodCallException('Method does not exist');
  }
177
178
  /**
   * Magic read access to the virtual properties of the parser.
   *
   * The property name is matched case-insensitively:
   * "outerhtml"/"outertext" map to html(), "innerhtml"/"innertext" to
   * innerHtml() and "text"/"plaintext" to text().
   *
   * @param string $name
   *
   * @return string|null null for any other property name
   */
  public function __get($name)
  {
    $property = strtolower($name);

    if ($property === 'outerhtml' || $property === 'outertext') {
      return $this->html();
    }

    if ($property === 'innerhtml' || $property === 'innertext') {
      return $this->innerHtml();
    }

    if ($property === 'text' || $property === 'plaintext') {
      return $this->text();
    }

    return null;
  }
201
202
  /**
   * Calling the parser like a function is a shortcut for find().
   *
   * @param string   $selector CSS selector
   * @param int|null $idx      optional index, passed through to find()
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|null
   */
  public function __invoke($selector, $idx = null)
  {
    $result = $this->find($selector, $idx);

    return $result;
  }
212
213
  /**
   * String conversion yields the same markup as html().
   *
   * @return string
   */
  public function __toString()
  {
    $markup = $this->html();

    return $markup;
  }
220
221
  /**
   * No-op kept only for API compatibility.
   *
   * @return bool always true
   */
  public function clear()
  {
    // nothing to release here
    return true;
  }
230
231
  /**
   * Replace special characters by placeholders before the HTML is parsed.
   *
   * Every character from $domReplaceHelper['orig'] is replaced in the whole
   * input. In addition, each http(s) link found by the pattern below is
   * replaced by a copy in which the characters from
   * $domLinkReplaceHelper['orig'] are swapped for their placeholders. Link
   * replacements are listed first, so they are applied before the generic
   * ones.
   *
   * @param string $html
   *
   * @return string the input with all placeholders applied
   */
  private function replaceToPreserveHtmlEntities($html)
  {
    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    preg_match_all("/(\bhttps?:\/\/[^\s()<>]+(?:\([\w\d]+\)|[^[:punct:]\s]|\/|\}|\]))/i", $html, $matches);

    if (!empty($matches[1])) {
      $foundLinks = $matches[1];

      $protectedLinks = array();
      foreach ($foundLinks as $link) {
        $protectedLinks[] = str_replace(
            self::$domLinkReplaceHelper['orig'],
            self::$domLinkReplaceHelper['tmp'],
            $link
        );
      }

      // links first, generic characters afterwards
      $search = array_merge($foundLinks, $search);
      $replace = array_merge($protectedLinks, $replace);
    }

    return str_replace($search, $replace, $html);
  }
263
264
  /**
   * Turn all placeholders back into their original characters.
   *
   * Besides the link and generic placeholders, the entity "&#13;" is
   * removed from the string.
   *
   * @param string $html
   *
   * @return string
   */
  public static function putReplacedBackToPreserveHtmlEntities($html)
  {
    $placeholders = array_merge(
        self::$domLinkReplaceHelper['tmp'],
        self::$domReplaceHelper['tmp']
    );
    $placeholders[] = '&#13;';

    $originals = array_merge(
        self::$domLinkReplaceHelper['orig'],
        self::$domReplaceHelper['orig']
    );
    $originals[] = '';

    return str_replace($placeholders, $originals, $html);
  }
285
286
  /**
   * Build the internal DOMDocument from an HTML string.
   *
   * The markup is first given to simplexml_load_string(); when that succeeds
   * and libxml reports no error, the resulting document is used directly.
   * Otherwise the input is trimmed, prefixed with an xml-encoding processing
   * instruction (the "UTF-8 hack"), passed through
   * replaceToPreserveHtmlEntities() and loaded with DOMDocument::loadHTML();
   * the processing instruction is removed again afterwards.
   *
   * The libxml "internal errors" and "entity loader" settings are changed
   * for the duration of the call and restored before returning.
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions extra LIBXML_* flags OR-ed into the options given to simplexml_load_string()
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html, $libXMLExtraOptions = null)
  {
    // Recompute both flags on every load, so that a parser which is reused
    // for a second document does not keep the flags of the first one.
    $this->isDOMDocumentCreatedWithoutHtml = (strpos($html, '<') === false);
    $this->isDOMDocumentCreatedWithoutHtmlWrapper = (strpos($html, '<html') === false);

    // set error level
    $internalErrors = libxml_use_internal_errors(true);
    $disableEntityLoader = libxml_disable_entity_loader(true);
    libxml_clear_errors();

    $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;

    // these constants are only used when the running PHP/libxml defines them
    foreach (array('LIBXML_COMPACT', 'LIBXML_HTML_NOIMPLIED', 'LIBXML_HTML_NODEFDTD') as $optionalFlag) {
      if (defined($optionalFlag)) {
        $options |= constant($optionalFlag);
      }
    }

    if ($libXMLExtraOptions !== null) {
      $options |= $libXMLExtraOptions;
    }

    $sxe = simplexml_load_string($html, 'SimpleXMLElement', $options);
    if ($sxe !== false && count(libxml_get_errors()) === 0) {
      $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {

      // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
      $html = trim($html);
      $xmlHackUsed = false;
      // NOTE(review): haystack and needle look swapped here. As written the
      // check only fails when $html is a prefix of the literal, so the hack
      // is applied to practically every input. Swapping the arguments would
      // skip the hack for inputs that already start with an xml declaration
      // and may change how those are decoded - confirm before changing.
      if (stripos('<?xml', $html) !== 0) {
        $xmlHackUsed = true;
        $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
      }

      $html = $this->replaceToPreserveHtmlEntities($html);

      $this->document->loadHTML($html);

      // remove the "xml-encoding" hack
      if ($xmlHackUsed === true) {
        // Walk a snapshot of the children: removing nodes while iterating
        // the live childNodes list makes the iteration skip siblings.
        foreach (iterator_to_array($this->document->childNodes) as $child) {
          if ($child->nodeType == XML_PI_NODE) {
            $this->document->removeChild($child);
          }
        }
      }

      libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($internalErrors);
    libxml_disable_entity_loader($disableEntityLoader);

    return $this->document;
  }
365
366
  /**
   * Look up a single element by its id (first match of the "#id" selector).
   *
   * @param string $id
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementById($id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
377
378
  /**
   * Look up the first element with the given tag name.
   *
   * @param string $name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank a blank node when nothing matches
   */
  public function getElementByTagName($name)
  {
    $firstMatch = $this->document->getElementsByTagName($name)->item(0);

    if ($firstMatch === null) {
      return new SimpleHtmlDomNodeBlank();
    }

    return new SimpleHtmlDom($firstMatch);
  }
395
396
  /**
   * Look up elements by id via the "#id" selector.
   *
   * @param string   $id
   * @param null|int $idx optional index, passed through to find()
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementsById($id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
408
409
  /**
   * Look up elements by tag name.
   *
   * Without an index the whole collection is returned. With an index the
   * element at that position is returned; a negative index counts from the
   * end of the collection.
   *
   * @param string   $name
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank a blank node when the index does not exist
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $matches = $this->document->getElementsByTagName($name);

    $collection = new SimpleHtmlDomNode();
    foreach ($matches as $match) {
      $collection[] = new SimpleHtmlDom($match);
    }

    if ($idx === null) {
      return $collection;
    }

    // negative index: count from the end
    $position = $idx < 0 ? count($collection) + $idx : $idx;

    return isset($collection[$position]) ? $collection[$position] : new SimpleHtmlDomNodeBlank();
  }
441
442
  /**
   * Find nodes with a CSS selector.
   *
   * The selector is converted to XPath by SelectorConverter and evaluated
   * against the current document. Without an index the whole collection is
   * returned. With an index the element at that position is returned; a
   * negative index counts from the end of the collection.
   *
   * @param string   $selector
   * @param int|null $idx
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank a blank node when the index does not exist
   */
  public function find($selector, $idx = null)
  {
    $query = SelectorConverter::toXPath($selector);

    $evaluator = new DOMXPath($this->document);
    $matches = $evaluator->query($query);

    $collection = new SimpleHtmlDomNode();
    foreach ($matches as $match) {
      $collection[] = new SimpleHtmlDom($match);
    }

    if ($idx === null) {
      return $collection;
    }

    // negative index: count from the end
    $position = $idx < 0 ? count($collection) + $idx : $idx;

    return isset($collection[$position]) ? $collection[$position] : new SimpleHtmlDomNodeBlank();
  }
476
477
  /**
   * Clean up serialized markup before it is handed to the caller.
   *
   * Depending on how the document was created, wrapper tags and line breaks
   * that were not part of the input are stripped. Afterwards HTML entities
   * are decoded, the string is trimmed and url-decoded, and finally the
   * placeholders are turned back into their original characters.
   *
   * @param string $content
   *
   * @return string
   */
  protected function fixHtmlOutput($content)
  {
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      $wrapperArtifacts = array(
          "\n",
          "\r\n",
          "\r",
          '<simpleHtmlDomP>',
          '</simpleHtmlDomP>',
          '<body>',
          '</body>',
          '<html>',
          '</html>',
      );

      $content = str_replace($wrapperArtifacts, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $plaintextArtifacts = array(
          '<p>',
          '</p>',
          '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">'
      );

      $content = str_replace($plaintextArtifacts, '', $content);
    }

    $decoded = trim(UTF8::html_entity_decode($content));
    $decoded = UTF8::urldecode($decoded);

    return self::putReplacedBackToPreserveHtmlEntities($decoded);
  }
524
525
  /**
   * Access the underlying document object.
   *
   * @return DOMDocument
   */
  public function getDocument()
  {
    $document = $this->document;

    return $document;
  }
532
533
  /**
   * Encoding used for the document and for the encoding hack on load.
   *
   * @return string
   */
  private function getEncoding()
  {
    $encoding = $this->encoding;

    return $encoding;
  }
542
543
  /**
   * Whether the loaded input contained no "<" character at all.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtml;

    return $flag;
  }
550
551
  /**
   * Whether the loaded input did not contain the string "<html".
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $flag;
  }
558
559
  /**
   * Serialize the document to HTML (the node's outer html).
   *
   * When a callback was registered it is invoked with this parser first.
   * Input that was loaded without an "<html" wrapper is serialized from the
   * document element only; otherwise the whole document is serialized. The
   * result is passed through fixHtmlOutput().
   *
   * @return string
   */
  public function html()
  {
    if ($this::$callback !== null) {
      call_user_func_array($this::$callback, array($this));
    }

    $serialized = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($serialized);
  }
578
579
  /**
   * Serialize the document as XML, without the XML header.
   *
   * Empty elements are written with explicit start and end tags
   * (LIBXML_NOEMPTYTAG); the result is passed through fixHtmlOutput().
   *
   * @return string
   */
  public function xml()
  {
    $serialized = $this->document->saveXML(null, LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $withoutHeader = preg_replace('/<\?xml.*\?>/', '', $serialized);

    return $this->fixHtmlOutput(ltrim($withoutHeader));
  }
593
594
  /**
   * Get dom node's inner html.
   *
   * Every child of the document element is serialized on its own, cleaned
   * up with fixHtmlOutput() and the pieces are concatenated.
   *
   * @return string empty string when the document has no document element
   */
  public function innerHtml()
  {
    $root = $this->document->documentElement;

    // An empty document (e.g. a parser created without content) has no
    // document element; there is nothing to serialize in that case.
    if ($root === null) {
      return '';
    }

    $text = '';

    foreach ($root->childNodes as $node) {
      $text .= $this->fixHtmlOutput($this->document->saveHTML($node));
    }

    return $text;
  }
609
610
  /**
   * Parse an HTML string and make it the current document.
   *
   * @param string   $html
   * @param int|null $libXMLExtraOptions extra LIBXML_* flags, passed on to createDOMDocument()
   *
   * @return HtmlDomParser this instance, for chaining
   *
   * @throws InvalidArgumentException if argument is not string
   */
  public function loadHtml($html, $libXMLExtraOptions = null)
  {
    if (is_string($html) === false) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    $parsed = $this->createDOMDocument($html, $libXMLExtraOptions);
    $this->document = $parsed;

    return $this;
  }
630
631
  /**
   * Load HTML from a local file or an http(s) URL.
   *
   * Local paths must exist; paths starting with "http://" or "https://" are
   * not checked with file_exists(). The content is read with
   * file_get_contents() and handed to loadHtml().
   *
   * @param string   $filePath
   * @param int|null $libXMLExtraOptions extra LIBXML_* flags, passed on to loadHtml()
   *
   * @return HtmlDomParser this instance, for chaining
   *
   * @throws InvalidArgumentException if $filePath is not a string
   * @throws RuntimeException         if the file does not exist or cannot be read
   */
  public function loadHtmlFile($filePath, $libXMLExtraOptions = null)
  {
    if (!is_string($filePath)) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    if (!preg_match("/^https?:\/\//i", $filePath) && !file_exists($filePath)) {
      throw new RuntimeException("File $filePath not found");
    }

    try {
      $html = file_get_contents($filePath);
    } catch (\Exception $e) {
      // Only reached when an error handler turns the read warning into an
      // exception; keep it as "previous" so the root cause is not lost.
      throw new RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html, $libXMLExtraOptions);

    return $this;
  }
664
665
  /**
   * Return the inner html of the document and optionally write it to a file.
   *
   * The file is only written when a non-empty path is given; it is written
   * with an exclusive lock. The result of the write is not checked.
   *
   * @param string $filepath
   *
   * @return string the inner html
   */
  public function save($filepath = '')
  {
    $markup = $this->innerHtml();

    if ($filepath === '') {
      return $markup;
    }

    file_put_contents($filepath, $markup, LOCK_EX);

    return $markup;
  }
681
682
  /**
   * Register the callback that html() invokes before serializing.
   *
   * The callback is stored in a static property, so it is shared by all
   * parser instances of the class.
   *
   * @param callable $functionName
   */
  public function set_callback($functionName)
  {
    $callback = $functionName;

    $this::$callback = $callback;
  }
689
690
  /**
   * Plain text of the document (its textContent), cleaned up with
   * fixHtmlOutput().
   *
   * @return string
   */
  public function text()
  {
    $plain = $this->document->textContent;

    return $this->fixHtmlOutput($plain);
  }
699
}
700