Completed
Push — master ( 57d66c...0d0990 )
by Lars
02:34
created

HtmlDomParser::__invoke()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 4
ccs 2
cts 2
cp 1
rs 10
cc 1
eloc 2
nc 1
nop 2
crap 1
1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMDocument;
7
use DOMXPath;
8
use InvalidArgumentException;
9
use RuntimeException;
10
11
/**
 * Class HtmlDomParser
 *
 * @package voku\helper
 *
 * @property-read string $outerText Get dom node's outer html (alias for "outerHtml")
 * @property-read string $outerHtml Get dom node's outer html
 * @property-read string $innerText Get dom node's inner html (alias for "innerHtml")
 * @property-read string $innerHtml Get dom node's inner html
 * @property-read string $plaintext Get dom node's plain text
 * @property-read string $text      Get dom node's plain text (alias for "plaintext")
 *
 * @method string outerText() Get dom node's outer html (alias for "outerHtml()")
 * @method string outerHtml() Get dom node's outer html
 * @method string innerText() Get dom node's inner html (alias for "innerHtml()")
 * @method HtmlDomParser load($html) Load HTML from string
 * @method HtmlDomParser load_file($filePath) Load HTML from file
 *
 * @method static HtmlDomParser file_get_html($filePath) Load HTML from file
 * @method static HtmlDomParser str_get_html($html) Load HTML from string
 */
31
class HtmlDomParser
32
{
33
  /**
   * Lower-cased alias names accepted by __call(), mapped to the real method
   * that gets invoked.
   *
   * @var array
   */
  protected static $functionAliases = array(
      'outertext' => 'html',
      'outerhtml' => 'html',
      'innertext' => 'innerHtml',
      'innerhtml' => 'innerHtml',
      'load'      => 'loadHtml',
      'load_file' => 'loadHtmlFile',
  );

  /**
   * Characters that are masked inside detected links ("orig") and the
   * placeholders used for them ("tmp"); both lists are index-aligned.
   *
   * @var array
   */
  private static $domLinkReplaceHelper = array(
      'orig' => array('[', ']', '{', '}'),
      'tmp'  => array(
          '!!!!HTML_DOM__SQUARE_BRACKET_LEFT!!!!',
          '!!!!HTML_DOM__SQUARE_BRACKET_RIGHT!!!!',
          '!!!!HTML_DOM__BRACKET_LEFT!!!!',
          '!!!!HTML_DOM__BRACKET_RIGHT!!!!',
      ),
  );

  /**
   * Characters that are masked in the whole input ("orig") and the
   * placeholders used for them ("tmp"); both lists are index-aligned.
   *
   * @var array
   */
  protected static $domReplaceHelper = array(
      'orig' => array('&', '|', '+', '%'),
      'tmp'  => array(
          '!!!!HTML_DOM__AMP!!!!',
          '!!!!HTML_DOM__PIPE!!!!',
          '!!!!HTML_DOM__PLUS!!!!',
          '!!!!HTML_DOM__PERCENT!!!!',
      ),
  );

  /**
   * Optional callback invoked by html() with the parser instance; shared by
   * all instances because it is static.
   *
   * @var callable|null
   */
  protected static $callback;

  /**
   * The wrapped DOM document.
   *
   * @var DOMDocument
   */
  protected $document;

  /**
   * Encoding passed to DOMDocument and used for the encoding prefix hack.
   *
   * @var string
   */
  protected $encoding = 'UTF-8';

  /**
   * Set when the loaded input contained no "<" at all (plain text).
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtml = false;

  /**
   * Set when the loaded input contained no "<html" substring.
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
95
96
  /**
   * Create the parser, optionally seeding it with content.
   *
   * A SimpleHtmlDom is unwrapped to its DOM node first. A DOM node is
   * deep-imported into a fresh document; any other non-null value is handed
   * to loadHtml().
   *
   * @param string|SimpleHtmlDom|\DOMNode|null $element HTML code or SimpleHtmlDom, \DOMNode
   */
  public function __construct($element = null)
  {
    $document = new \DOMDocument('1.0', $this->getEncoding());

    // DOMDocument settings
    $document->preserveWhiteSpace = false;
    $document->formatOutput = true;

    $this->document = $document;

    if ($element instanceof SimpleHtmlDom) {
      $element = $element->getNode();
    }

    if ($element instanceof \DOMNode) {
      $imported = $this->document->importNode($element, true);

      if ($imported instanceof \DOMNode) {
        $this->document->appendChild($imported);
      }
    } elseif ($element !== null) {
      $this->loadHtml($element);
    }
  }
127
128
  /**
   * Dispatch alias method names (case-insensitive) to their real methods.
   *
   * @param string $name      called method name
   * @param array  $arguments arguments forwarded to the real method
   *
   * @return mixed result of the aliased method
   *
   * @throws BadMethodCallException if the name is not a known alias
   */
  public function __call($name, $arguments)
  {
    $name = strtolower($name);

    if (!isset(self::$functionAliases[$name])) {
      throw new BadMethodCallException('Method does not exist: ' . $name);
    }

    $target = array($this, self::$functionAliases[$name]);

    return call_user_func_array($target, $arguments);
  }
144
145
  /**
   * Static entry points "str_get_html" and "file_get_html".
   *
   * Both create a new parser and load the first argument as an HTML string
   * or as a file path respectively.
   *
   * @param string $name      called static method name
   * @param array  $arguments call arguments; only the first one is used
   *
   * @return HtmlDomParser
   *
   * @throws BadMethodCallException if the name is not one of the two supported methods
   */
  public static function __callStatic($name, $arguments)
  {
    // A missing argument becomes null, which the loaders reject with an
    // InvalidArgumentException instead of an undefined-offset notice first.
    $argument = isset($arguments[0]) ? $arguments[0] : null;

    if ($name === 'str_get_html') {
      $parser = new self();

      return $parser->loadHtml($argument);
    }

    if ($name === 'file_get_html') {
      $parser = new self();

      return $parser->loadHtmlFile($argument);
    }

    throw new BadMethodCallException('Method does not exist: ' . $name);
  }
167
168
  /**
   * Read-only virtual properties (names are matched case-insensitively).
   *
   * @param string $name property name
   *
   * @return string|null the rendered value, or null for an unknown property
   */
  public function __get($name)
  {
    $property = strtolower($name);

    if ($property === 'outerhtml' || $property === 'outertext') {
      return $this->html();
    }

    if ($property === 'innerhtml' || $property === 'innertext') {
      return $this->innerHtml();
    }

    if ($property === 'text' || $property === 'plaintext') {
      return $this->text();
    }

    return null;
  }
191
192
  /**
   * Calling the parser like a function is a shortcut for find().
   *
   * @param string   $selector
   * @param int|null $idx
   *
   * @return mixed whatever find() returns for the same arguments
   */
  public function __invoke($selector, $idx = null)
  {
    $result = $this->find($selector, $idx);

    return $result;
  }
202
203
  /**
   * String conversion yields the same output as html().
   *
   * @return string
   */
  public function __toString()
  {
    $outerHtml = $this->html();

    return $outerHtml;
  }
210
211
  /**
   * No-op kept only for API compatibility.
   *
   * @return bool always true
   */
  public function clear()
  {
    // nothing to release here
    return true;
  }
220
221
  /**
   * Mask characters with placeholders before the HTML is parsed.
   *
   * Detected http(s) links get their brackets and braces masked first; after
   * that "&", "|", "+" and "%" are masked in the whole string.
   * putReplacedBackToPreserveHtmlEntities() reverses the masking.
   *
   * @param string $html
   *
   * @return string
   */
  private function replaceToPreserveHtmlEntities($html)
  {
    preg_match_all("/(\bhttps?:\/\/[^\s()<>]+(?:\([\w\d]+\)|[^[:punct:]\s]|\/|\}|\]))/i", $html, $matches);

    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    if (!empty($matches[1])) {
      $links = $matches[1];
      $maskedLinks = array();

      foreach ($links as $key => $link) {
        $maskedLinks[$key] = str_replace(
            self::$domLinkReplaceHelper['orig'],
            self::$domLinkReplaceHelper['tmp'],
            $link
        );
      }

      // links go first so they are swapped before the single characters
      $search = array_merge($links, $search);
      $replace = array_merge($maskedLinks, $replace);
    }

    return str_replace($search, $replace, $html);
  }
253
254
  /**
   * Turn all masking placeholders back into their original characters and
   * drop every "&#13;" sequence.
   *
   * @param string $html
   *
   * @return string
   */
  public static function putReplacedBackToPreserveHtmlEntities($html)
  {
    $placeholders = array_merge(
        self::$domLinkReplaceHelper['tmp'],
        self::$domReplaceHelper['tmp'],
        array('&#13;')
    );

    $originals = array_merge(
        self::$domLinkReplaceHelper['orig'],
        self::$domReplaceHelper['orig'],
        array('')
    );

    return str_replace($placeholders, $originals, $html);
  }
275
276
  /**
   * Create the DOMDocument from an HTML string.
   *
   * The input is first parsed as XML through SimpleXML. If that fails, or
   * libxml reports any error, the input is parsed with
   * DOMDocument::loadHTML() instead, prefixed with an XML encoding
   * declaration so that the configured encoding is used.
   *
   * @param string $html
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html)
  {
    // Recompute both flags on every load, so a reused parser does not keep
    // the state of a previously loaded document.
    $this->isDOMDocumentCreatedWithoutHtml = (strpos($html, '<') === false);
    $this->isDOMDocumentCreatedWithoutHtmlWrapper = (strpos($html, '<html') === false);

    // set error level
    $internalErrors = libxml_use_internal_errors(true);
    $disableEntityLoader = libxml_disable_entity_loader(true);
    libxml_clear_errors();

    $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;
    // defined() expects the constant's name as a string
    if (defined('LIBXML_COMPACT')) {
      $options |= LIBXML_COMPACT;
    }

    $sxe = simplexml_load_string($html, 'SimpleXMLElement', $options);
    if ($sxe !== false && count(libxml_get_errors()) === 0) {
      $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {

      // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
      $html = trim($html);
      $xmlHackUsed = false;
      // stripos(haystack, needle): only add the prefix when the input does
      // not already start with an XML declaration
      if (stripos($html, '<?xml') !== 0) {
        $xmlHackUsed = true;
        $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
      }

      $html = $this->replaceToPreserveHtmlEntities($html);

      $this->document->loadHTML($html);

      // remove the "xml-encoding" hack
      if ($xmlHackUsed === true) {
        // childNodes is a live list: collect first, remove afterwards
        $piNodes = array();
        foreach ($this->document->childNodes as $child) {
          if ($child->nodeType === XML_PI_NODE) {
            $piNodes[] = $child;
          }
        }

        foreach ($piNodes as $piNode) {
          $this->document->removeChild($piNode);
        }
      }

      libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($internalErrors);
    libxml_disable_entity_loader($disableEntityLoader);

    return $this->document;
  }
341
342
  /**
   * Return the first element matching the given id.
   *
   * Delegates to find() with the selector "#<id>" and index 0.
   *
   * @param string $id
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementById($id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
353
354
  /**
   * Return the first element with the given tag name.
   *
   * @param string $name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank blank node object when no element matches
   */
  public function getElementByTagName($name)
  {
    $first = $this->document->getElementsByTagName($name)->item(0);

    if ($first === null) {
      return new SimpleHtmlDomNodeBlank();
    }

    return new SimpleHtmlDom($first);
  }
371
372
  /**
   * Return the elements matching the given id.
   *
   * Delegates to find() with the selector "#<id>".
   *
   * @param string   $id
   * @param null|int $idx optional index, passed through to find()
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
   */
  public function getElementsById($id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
384
385
  /**
   * Return the elements with the given tag name.
   *
   * Without an index the whole collection is returned. With an index the
   * single element at that position is returned; a negative index counts
   * from the end of the collection.
   *
   * @param string   $name
   * @param null|int $idx
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank blank node object when the index does not exist
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $elements = new SimpleHtmlDomNode();

    foreach ($this->document->getElementsByTagName($name) as $node) {
      $elements[] = new SimpleHtmlDom($node);
    }

    if ($idx === null) {
      return $elements;
    }

    if ($idx < 0) {
      $idx += count($elements);
    }

    return isset($elements[$idx]) ? $elements[$idx] : new SimpleHtmlDomNodeBlank();
  }
417
418
  /**
   * Find list of nodes with a CSS selector.
   *
   * The selector is converted to XPath by SelectorConverter and evaluated
   * against the document. Without an index the whole collection is
   * returned. With an index the single element at that position is
   * returned; a negative index counts from the end of the collection.
   *
   * @param string   $selector
   * @param int|null $idx
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank blank node object when the index does not exist
   */
  public function find($selector, $idx = null)
  {
    $xPathQuery = SelectorConverter::toXPath($selector);

    $xPath = new DOMXPath($this->document);
    $nodesList = $xPath->query($xPathQuery);
    $elements = new SimpleHtmlDomNode();

    // DOMXPath::query() returns false for a malformed expression; treat
    // that as "no matches" instead of iterating over a boolean.
    if ($nodesList !== false) {
      foreach ($nodesList as $node) {
        $elements[] = new SimpleHtmlDom($node);
      }
    }

    if (null === $idx) {
      return $elements;
    }

    if ($idx < 0) {
      $idx = count($elements) + $idx;
    }

    if (isset($elements[$idx])) {
      return $elements[$idx];
    }

    return new SimpleHtmlDomNodeBlank();
  }
452
453
  /**
   * Clean up serialized output before it is handed to the caller.
   *
   * Strips the wrapper markup added for input that had no html wrapper or
   * no markup at all, decodes HTML entities and URL encoding, and finally
   * restores the characters masked by replaceToPreserveHtmlEntities().
   *
   * @param string $content
   *
   * @return string
   */
  protected function fixHtmlOutput($content)
  {
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      $wrapperNoise = array(
          "\n",
          "\r\n",
          "\r",
          '<simpleHtmlDomP>',
          '</simpleHtmlDomP>',
          '<body>',
          '</body>',
          '<html>',
          '</html>',
      );

      $content = str_replace($wrapperNoise, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $plainTextNoise = array(
          '<p>',
          '</p>',
          '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
      );

      $content = str_replace($plainTextNoise, '', $content);
    }

    $decoded = UTF8::html_entity_decode($content);
    $decoded = UTF8::urldecode(trim($decoded));

    return self::putReplacedBackToPreserveHtmlEntities($decoded);
  }
500
501
  /**
   * Access the wrapped DOM document.
   *
   * @return DOMDocument
   */
  public function getDocument()
  {
    $document = $this->document;

    return $document;
  }
508
509
  /**
   * Encoding used when creating and loading the document.
   *
   * @return string
   */
  private function getEncoding()
  {
    $encoding = $this->encoding;

    return $encoding;
  }
518
519
  /**
   * Whether the loaded input contained no "<" character at all.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml()
  {
    $withoutHtml = $this->isDOMDocumentCreatedWithoutHtml;

    return $withoutHtml;
  }
526
527
  /**
   * Whether the loaded input contained no "<html" substring.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
  {
    $withoutWrapper = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $withoutWrapper;
  }
534
535
  /**
   * Get dom node's outer html.
   *
   * Runs the registered callback (if any) with this parser first. Input
   * without an html wrapper is serialized from the document element only;
   * otherwise the whole document is serialized.
   *
   * @return string
   */
  public function html()
  {
    if (static::$callback !== null) {
      call_user_func_array(static::$callback, array($this));
    }

    $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($content);
  }
554
555
  /**
   * Get the HTML as XML, without the XML header.
   *
   * @return string
   */
  public function xml()
  {
    $rawXml = $this->document->saveXML(null, LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $withoutHeader = preg_replace('/<\?xml.*\?>/', '', $rawXml);

    return $this->fixHtmlOutput(ltrim($withoutHeader));
  }
569
570
  /**
   * Get dom node's inner html.
   *
   * Serializes every child of the document element, cleans each fragment
   * with fixHtmlOutput() and concatenates the results.
   *
   * @return string empty string when the document has no root element
   */
  public function innerHtml()
  {
    $root = $this->document->documentElement;

    // An empty document (e.g. a parser created without content) has no
    // document element to iterate over.
    if ($root === null) {
      return '';
    }

    $text = '';

    foreach ($root->childNodes as $node) {
      $text .= $this->fixHtmlOutput($this->document->saveHTML($node));
    }

    return $text;
  }
585
586
  /**
   * Load HTML from string, replacing the current document.
   *
   * @param string $html
   *
   * @return HtmlDomParser this instance, for chaining
   *
   * @throws InvalidArgumentException if argument is not string
   */
  public function loadHtml($html)
  {
    if (is_string($html) === false) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    $document = $this->createDOMDocument($html);
    $this->document = $document;

    return $this;
  }
605
606
  /**
   * Load HTML from file.
   *
   * Paths starting with "http://" or "https://" skip the existence check
   * and are read directly.
   *
   * @param string $filePath local path or http(s) URL
   *
   * @return HtmlDomParser this instance, for chaining
   *
   * @throws InvalidArgumentException if argument is not string
   * @throws RuntimeException if the file does not exist or cannot be read
   */
  public function loadHtmlFile($filePath)
  {
    if (!is_string($filePath)) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    if (!preg_match("/^https?:\/\//i", $filePath) && !file_exists($filePath)) {
      throw new RuntimeException("File $filePath not found");
    }

    try {
      $html = file_get_contents($filePath);
    } catch (\Exception $e) {
      // keep the original failure reachable through getPrevious()
      throw new RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html);

    return $this;
  }
638
639
  /**
   * Return the inner html and optionally write it to a file.
   *
   * The file is written with an exclusive lock; the result of the write is
   * not checked.
   *
   * @param string $filepath target file; an empty string skips writing
   *
   * @return string the inner html
   */
  public function save($filepath = '')
  {
    $html = $this->innerHtml();

    if ($filepath === '') {
      return $html;
    }

    file_put_contents($filepath, $html, LOCK_EX);

    return $html;
  }
655
656
  /**
   * Register the callback that html() invokes with the parser instance.
   *
   * The callback is stored in a static property.
   *
   * @param callable|null $functionName
   */
  public function set_callback($functionName)
  {
    static::$callback = $functionName;
  }
663
664
  /**
   * Get dom node's plain text.
   *
   * Takes the document's text content and cleans it with fixHtmlOutput().
   *
   * @return string
   */
  public function text()
  {
    $plainText = $this->document->textContent;

    return $this->fixHtmlOutput($plainText);
  }
673
}
674