Completed
Push — master ( 28f10e...2739b6 )
by Lars
02:58
created

HtmlDomParser::getEncoding()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 4
ccs 2
cts 2
cp 1
rs 10
cc 1
eloc 2
nc 1
nop 0
crap 1
1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMDocument;
7
use DOMXPath;
8
use InvalidArgumentException;
9
use RuntimeException;
10
11
/**
12
 * Class HtmlDomParser
13
 *
14
 * @package voku\helper
15
 *
16
 * @property-read string outerText Get dom node's outer html (alias for "outerHtml")
17
 * @property-read string outerHtml Get dom node's outer html
18
 * @property-read string innerText Get dom node's inner html (alias for "innerHtml")
19
 * @property-read string innerHtml Get dom node's inner html
20
 * @property-read string plaintext Get dom node's plain text
21
 *
22
 * @method string outerText() Get dom node's outer html (alias for "outerHtml()")
23
 * @method string outerHtml() Get dom node's outer html
24
 * @method string innerText() Get dom node's inner html (alias for "innerHtml()")
25
 * @method HtmlDomParser load() load($html) Load HTML from string
26
 * @method HtmlDomParser load_file() load_file($html) Load HTML from file
27
 *
28
 * @method static HtmlDomParser file_get_html() file_get_html($html) Load HTML from file
29
 * @method static HtmlDomParser str_get_html() str_get_html($html) Load HTML from string
30
 */
31
class HtmlDomParser
32
{
33
  /**
   * Lower-cased legacy method names mapped to the method that implements them; looked up by __call().
   *
   * @var array
   */
  protected static $functionAliases = array(
      'outertext' => 'html',
      'outerhtml' => 'html',
      'innertext' => 'innerHtml',
      'innerhtml' => 'innerHtml',
      'load'      => 'loadHtml',
      'load_file' => 'loadHtmlFile',
  );

  /**
   * Bracket characters ("orig") and the placeholders ("tmp") substituted for them inside http(s) links
   * before parsing; putReplacedBackToPreserveHtmlEntities() reverses the substitution.
   *
   * @var array
   */
  private static $domLinkReplaceHelper = array(
      'orig' => array('[', ']', '{', '}',),
      'tmp'  => array(
          '!!!!HTML_DOM__SQUARE_BRACKET_LEFT!!!!',
          '!!!!HTML_DOM__SQUARE_BRACKET_RIGHT!!!!',
          '!!!!HTML_DOM__BRACKET_LEFT!!!!',
          '!!!!HTML_DOM__BRACKET_RIGHT!!!!',
      ),
  );

  /**
   * Characters ("orig") and the placeholders ("tmp") substituted for them in the whole markup
   * before parsing; putReplacedBackToPreserveHtmlEntities() reverses the substitution.
   *
   * @var array
   */
  protected static $domReplaceHelper = array(
      'orig' => array('&', '|', '+', '%'),
      'tmp'  => array(
          '!!!!HTML_DOM__AMP!!!!',
          '!!!!HTML_DOM__PIPE!!!!',
          '!!!!HTML_DOM__PLUS!!!!',
          '!!!!HTML_DOM__PERCENT!!!!',
      ),
  );

  /**
   * Callback registered through set_callback(); html() invokes it with the parser before serialising.
   *
   * @var Callable
   */
  protected static $callback;

  /**
   * The DOM document this parser wraps.
   *
   * @var DOMDocument
   */
  protected $document;

  /**
   * Encoding handed to DOMDocument and used for the XML-declaration encoding hint.
   *
   * @var string
   */
  protected $encoding = 'UTF-8';

  /**
   * Set to true by createDOMDocument() when the loaded string contains no "<" at all.
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtml = false;

  /**
   * Set to true by createDOMDocument() when the loaded string contains no "<html".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
95
96
  /**
97
   * Constructor
98
   *
99
   * @param string|SimpleHtmlDom|\DOMNode $element HTML code or SimpleHtmlDom, \DOMNode
100
   */
101 113
  public function __construct($element = null)
102
  {
103 113
    $this->document = new \DOMDocument('1.0', $this->getEncoding());
104
105
    // DOMDocument settings
106 113
    $this->document->preserveWhiteSpace = false;
107 113
    $this->document->formatOutput = true;
108
109 113
    if ($element instanceof SimpleHtmlDom) {
110 51
      $element = $element->getNode();
111 51
    }
112
113 113
    if ($element instanceof \DOMNode) {
114 51
      $domNode = $this->document->importNode($element, true);
115
116 51
      if ($domNode instanceof \DOMNode) {
117 51
        $this->document->appendChild($domNode);
118 51
      }
119
120 51
      return;
121
    }
122
123 113
    if ($element !== null) {
124 69
      $this->loadHtml($element);
125 68
    }
126 112
  }
127
128
  /**
   * Forward calls to legacy method names (see $functionAliases) to the method that implements them.
   *
   * The requested name is lower-cased first, so the lookup is case-insensitive.
   *
   * @param string $name      the method name that was called
   * @param array  $arguments the arguments of the call
   *
   * @return mixed the result of the aliased method
   *
   * @throws BadMethodCallException if no alias exists for the name
   */
  public function __call($name, $arguments)
  {
    $alias = strtolower($name);

    if (!isset(self::$functionAliases[$alias])) {
      throw new BadMethodCallException('Method does not exist: ' . $alias);
    }

    return call_user_func_array(array($this, self::$functionAliases[$alias]), $arguments);
  }
144
145
  /**
   * Static factories "str_get_html" (from a string) and "file_get_html" (from a file path / URL).
   *
   * @param string $name      the static method name that was called
   * @param array  $arguments the arguments of the call; only the first one is used
   *
   * @return HtmlDomParser
   *
   * @throws BadMethodCallException for any other method name
   */
  public static function __callStatic($name, $arguments)
  {
    switch ($name) {
      case 'str_get_html':
        $instance = new self();

        return $instance->loadHtml($arguments[0]);

      case 'file_get_html':
        $instance = new self();

        return $instance->loadHtmlFile($arguments[0]);

      default:
        throw new BadMethodCallException('Method does not exist');
    }
  }
167
168
  /**
   * Magic read access to the serialised document; property names are matched case-insensitively.
   *
   * "outerhtml"/"outertext" map to html(), "innerhtml"/"innertext" to innerHtml(),
   * "text"/"plaintext" to text().
   *
   * @param string $name
   *
   * @return string|null null for any other property name
   */
  public function __get($name)
  {
    $property = strtolower($name);

    if ($property === 'outerhtml' || $property === 'outertext') {
      return $this->html();
    }

    if ($property === 'innerhtml' || $property === 'innertext') {
      return $this->innerHtml();
    }

    if ($property === 'text' || $property === 'plaintext') {
      return $this->text();
    }

    return null;
  }
191
192
  /**
   * Calling the parser like a function is shorthand for find().
   *
   * @param string   $selector
   * @param int|null $idx
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
   */
  public function __invoke($selector, $idx = null)
  {
    $result = $this->find($selector, $idx);

    return $result;
  }
202
203
  /**
   * String conversion yields the same output as html().
   *
   * @return string
   */
  public function __toString()
  {
    $markup = $this->html();

    return $markup;
  }
210
211
  /**
   * No-op kept for API compatibility.
   *
   * @return bool always true
   */
  public function clear()
  {
    // nothing to release; kept only for api-compatibility-reasons
    return true;
  }
220
221
  /**
   * Swap characters listed in $domReplaceHelper for placeholders and, inside http(s) links,
   * also the brackets listed in $domLinkReplaceHelper.
   *
   * putReplacedBackToPreserveHtmlEntities() is the inverse operation.
   *
   * @param string $html
   *
   * @return string
   */
  private function replaceToPreserveHtmlEntities($html)
  {
    preg_match_all("/(\bhttps?:\/\/[^\s()<>]+(?:\([\w\d]+\)|[^[:punct:]\s]|\/|\}|\]))/i", $html, $matches);

    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    if (!empty($matches[1])) {
      $links = $matches[1];

      // with an array subject str_replace() rewrites every link and keeps the keys
      $maskedLinks = str_replace(
          self::$domLinkReplaceHelper['orig'],
          self::$domLinkReplaceHelper['tmp'],
          $links
      );

      // links come first, so each link is swapped for its masked form before the generic pass
      $search = array_merge($links, $search);
      $replace = array_merge($maskedLinks, $replace);
    }

    return str_replace($search, $replace, $html);
  }
253
254
  /**
   * Turn the placeholders from $domLinkReplaceHelper / $domReplaceHelper back into the original
   * characters and strip "&#13;" from the string.
   *
   * @param string $html
   *
   * @return string
   */
  public static function putReplacedBackToPreserveHtmlEntities($html)
  {
    $placeholders = array_merge(
        self::$domLinkReplaceHelper['tmp'],
        self::$domReplaceHelper['tmp'],
        array('&#13;')
    );

    $originals = array_merge(
        self::$domLinkReplaceHelper['orig'],
        self::$domReplaceHelper['orig'],
        array('')
    );

    return str_replace($placeholders, $originals, $html);
  }
275
276
  /**
   * create DOMDocument from HTML
   *
   * The string is first parsed as XML; if that fails or reports any libxml error it is parsed
   * with DOMDocument::loadHTML() instead. As a side effect the two "created without ..." flags
   * are set when the string contains no "<" / no "<html".
   *
   * @param string $html
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html)
  {
    if (strpos($html, '<') === false) {
      $this->isDOMDocumentCreatedWithoutHtml = true;
    }

    if (strpos($html, '<html') === false) {
      $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
    }

    // collect libxml errors instead of emitting warnings, and block external entities;
    // the previous values are remembered so they can be restored below
    $internalErrors = libxml_use_internal_errors(true);
    $disableEntityLoader = libxml_disable_entity_loader(true);
    libxml_clear_errors();

    $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;
    // defined() takes the constant *name* as a string; handing it the constant itself
    // looks up a constant named after its integer value and is therefore always false
    if (defined('LIBXML_COMPACT')) {
      $options |= LIBXML_COMPACT;
    }

    $sxe = simplexml_load_string($html, 'SimpleXMLElement', $options);
    if ($sxe !== false && count(libxml_get_errors()) === 0) {
      $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {

      // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
      $html = trim($html);
      $xmlHackUsed = false;
      // the encoding hint is only needed when the markup does not already open with an
      // XML declaration that names an encoding
      if (preg_match('/^<\?xml\b[^>]*\sencoding\s*=/i', $html) !== 1) {
        $xmlHackUsed = true;
        $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
      }

      $html = $this->replaceToPreserveHtmlEntities($html);

      $this->document->loadHTML($html);

      // remove the "xml-encoding" hack
      if ($xmlHackUsed === true) {
        foreach ($this->document->childNodes as $child) {
          if ($child->nodeType === XML_PI_NODE) {
            $this->document->removeChild($child);
          }
        }
      }

      libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($internalErrors);
    libxml_disable_entity_loader($disableEntityLoader);

    return $this->document;
  }
341
342
  /**
   * Return the first element matching the id selector "#<id>".
   *
   * @param string $id
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementById($id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
353
354
  /**
   * Return the first element with the given tag name.
   *
   * @param string $name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank a blank node if the document has no such tag
   */
  public function getElementByTagName($name)
  {
    $first = $this->document->getElementsByTagName($name)->item(0);

    if ($first === null) {
      return new SimpleHtmlDomNodeBlank();
    }

    return new SimpleHtmlDom($first);
  }
371
372
  /**
   * Run find() with the id selector "#<id>".
   *
   * @param string   $id
   * @param null|int $idx passed through to find()
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
   */
  public function getElementsById($id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
384
385
  /**
   * Returns Elements by tag name
   *
   * @param string   $name
   * @param null|int $idx null for the whole list; a negative value counts from the end of the list
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank the list when $idx is null, otherwise
   *                                                                the element at $idx or a blank node
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $elements = new SimpleHtmlDomNode();

    foreach ($this->document->getElementsByTagName($name) as $domNode) {
      $elements[] = new SimpleHtmlDom($domNode);
    }

    if ($idx === null) {
      return $elements;
    }

    // negative positions are relative to the end of the list
    $position = ($idx < 0) ? count($elements) + $idx : $idx;

    return isset($elements[$position]) ? $elements[$position] : new SimpleHtmlDomNodeBlank();
  }
417
418
  /**
   * Find list of nodes with a CSS selector.
   *
   * The selector is converted to XPath by SelectorConverter and evaluated against the document.
   *
   * @param string   $selector
   * @param int|null $idx null for the whole list; a negative value counts from the end of the list
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank the list when $idx is null, otherwise
   *                                                                the element at $idx or a blank node
   */
  public function find($selector, $idx = null)
  {
    $query = SelectorConverter::toXPath($selector);
    $xPath = new DOMXPath($this->document);
    $matches = $xPath->query($query);

    $elements = new SimpleHtmlDomNode();
    foreach ($matches as $domNode) {
      $elements[] = new SimpleHtmlDom($domNode);
    }

    if ($idx === null) {
      return $elements;
    }

    // negative positions are relative to the end of the list
    $position = ($idx < 0) ? count($elements) + $idx : $idx;

    return isset($elements[$position]) ? $elements[$position] : new SimpleHtmlDomNodeBlank();
  }
452
453
  /**
   * Post-process serialised output: strip wrapper markup depending on how the document was created,
   * decode entities and url-encoding, and restore the placeholder characters.
   *
   * @param string $content
   *
   * @return string
   */
  protected function fixHtmlOutput($content)
  {
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      // input had no "<html": drop line breaks and the wrapper tags
      $wrapperParts = array(
          "\n",
          "\r\n",
          "\r",
          '<simpleHtmlDomP>',
          '</simpleHtmlDomP>',
          '<body>',
          '</body>',
          '<html>',
          '</html>',
      );
      $content = str_replace($wrapperParts, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      // input had no markup at all: drop the paragraph tags and the doctype as well
      $plainTextParts = array(
          '<p>',
          '</p>',
          '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">'
      );
      $content = str_replace($plainTextParts, '', $content);
    }

    $decoded = trim(UTF8::html_entity_decode($content));
    $decoded = UTF8::urldecode($decoded);

    return self::putReplacedBackToPreserveHtmlEntities($decoded);
  }
500
501
  /**
   * Get the wrapped DOM document.
   *
   * @return DOMDocument
   */
  public function getDocument()
  {
    $document = $this->document;

    return $document;
  }
508
509
  /**
   * Get the encoding this parser uses for its document.
   *
   * @return string
   */
  private function getEncoding()
  {
    $encoding = $this->encoding;

    return $encoding;
  }
518
519
  /**
   * Whether the loaded string contained no "<" at all.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtml;

    return $flag;
  }
526
527
  /**
   * Whether the loaded string contained no "<html".
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $flag;
  }
534
535
  /**
   * Get dom node's outer html
   *
   * A callback registered via set_callback() is invoked with this parser before serialising.
   *
   * @return string
   */
  public function html()
  {
    if ($this::$callback !== null) {
      call_user_func_array($this::$callback, array($this));
    }

    // without an "<html" wrapper in the input only the root element is serialised
    $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($content);
  }
554
555
  /**
   * Get the HTML as XML.
   *
   * @return string the XML serialisation without its "<?xml ... ?>" header
   */
  public function xml()
  {
    $serialized = $this->document->saveXML(null, LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $withoutHeader = preg_replace('/<\?xml.*\?>/', '', $serialized);

    return $this->fixHtmlOutput(ltrim($withoutHeader));
  }
569
570
  /**
   * Get dom node's inner html
   *
   * Each child of the document's root element is serialised and cleaned up separately,
   * then the pieces are concatenated.
   *
   * @return string empty string when the document has no root element
   */
  public function innerHtml()
  {
    $root = $this->document->documentElement;

    // a document with nothing loaded has no root element (documentElement is null)
    if ($root === null) {
      return '';
    }

    $text = '';

    foreach ($root->childNodes as $node) {
      $text .= $this->fixHtmlOutput($this->document->saveHTML($node));
    }

    return $text;
  }
585
586
  /**
   * Load HTML from string
   *
   * Replaces the current document with one built from the given markup.
   *
   * @param string $html
   *
   * @return HtmlDomParser this parser, for chaining
   *
   * @throws InvalidArgumentException if argument is not string
   */
  public function loadHtml($html)
  {
    if (is_string($html) === false) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    $document = $this->createDOMDocument($html);
    $this->document = $document;

    return $this;
  }
605
606
  /**
   * Load HTML from file
   *
   * Paths starting with "http://" or "https://" skip the file_exists() check.
   *
   * @param string $filePath
   *
   * @return HtmlDomParser this parser, for chaining
   *
   * @throws InvalidArgumentException if the argument is not a string
   * @throws RuntimeException         if the file does not exist or cannot be read
   */
  public function loadHtmlFile($filePath)
  {
    if (is_string($filePath) === false) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    $isRemote = (bool)preg_match("/^https?:\/\//i", $filePath);
    if (!$isRemote && !file_exists($filePath)) {
      throw new RuntimeException("File $filePath not found");
    }

    try {
      $contents = file_get_contents($filePath);
    } catch (\Exception $e) {
      // only reached when an error handler turns the read warning into an exception
      throw new RuntimeException("Could not load file $filePath");
    }

    if ($contents === false) {
      throw new RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($contents);

    return $this;
  }
638
639
  /**
   * Save dom as string
   *
   * @param string $filepath when not empty, the inner html is also written to this path (with LOCK_EX)
   *
   * @return string the inner html of the document
   */
  public function save($filepath = '')
  {
    $content = $this->innerHtml();

    if ($filepath === '') {
      return $content;
    }

    file_put_contents($filepath, $content, LOCK_EX);

    return $content;
  }
655
656
  /**
   * Register the callback that html() invokes with the parser before serialising.
   *
   * The callback is kept in a static property.
   *
   * @param Callable $functionName
   */
  public function set_callback($functionName)
  {
    $callback = $functionName;

    $this::$callback = $callback;
  }
663
664
  /**
   * Get dom node's plain text
   *
   * @return string the document's text content after the usual output clean-up
   */
  public function text()
  {
    $plain = $this->document->textContent;

    return $this->fixHtmlOutput($plain);
  }
673
}
674