Completed
Push — master ( c60398...f8ae2f )
by Lars
03:00
created

HtmlDomParser::clear()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 4
ccs 2
cts 2
cp 1
rs 10
cc 1
eloc 2
nc 1
nop 0
crap 1
1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMDocument;
7
use DOMXPath;
8
use InvalidArgumentException;
9
use RuntimeException;
10
11
/**
12
 * Class HtmlDomParser
13
 *
14
 * @package voku\helper
15
 *
16
 * @property string      outertext Get dom node's outer html
17
 * @property string      innertext Get dom node's inner html
18
 * @property-read string plaintext Get dom node's plain text
19
 *
20
 * @method string outertext() Get dom node's outer html
21
 * @method string innertext() Get dom node's inner html
22
 * @method HtmlDomParser load() load($html) Load HTML from string
23
 * @method HtmlDomParser load_file() load_file($html) Load HTML from file
24
 *
25
 * @method static HtmlDomParser file_get_html() file_get_html($html) Load HTML from file
26
 * @method static HtmlDomParser str_get_html() str_get_html($html) Load HTML from string
27
 */
28
class HtmlDomParser
29
{
30
  /**
31
   * @var array
32
   */
33
  protected static $functionAliases = array(
34
      'outertext' => 'html',
35
      'innertext' => 'innerHtml',
36
      'load'      => 'loadHtml',
37
      'load_file' => 'loadHtmlFile',
38
  );
39
40
  /**
41
   * @var array
42
   */
43
  private static $domLinkReplaceHelper = array(
44
      'orig' => array('[', ']', '{', '}',),
45
      'tmp'  => array(
46
          '!!!!HTML_DOM__SQUARE_BRACKET_LEFT!!!!',
47
          '!!!!HTML_DOM__SQUARE_BRACKET_RIGHT!!!!',
48
          '!!!!HTML_DOM__BRACKET_LEFT!!!!',
49
          '!!!!HTML_DOM__BRACKET_RIGHT!!!!',
50
      ),
51
  );
52
53
  /**
54
   * @var array
55
   */
56
  protected static $domReplaceHelper = array(
57
      'orig' => array('&', '|', '+', '%'),
58
      'tmp'  => array(
59
          '!!!!HTML_DOM__AMP!!!!',
60
          '!!!!HTML_DOM__PIPE!!!!',
61
          '!!!!HTML_DOM__PLUS!!!!',
62
          '!!!!HTML_DOM__PERCENT!!!!',
63
      ),
64
  );
65
66
  /**
67
   * @var Callable
68
   */
69
  protected static $callback;
70
71
  /**
72
   * @var DOMDocument
73
   */
74
  protected $document;
75
76
  /**
77
   * @var string
78
   */
79
  protected $encoding = 'UTF-8';
80
81
  /**
82
   * @var bool
83
   */
84
  protected $isDOMDocumentCreatedWithoutHtml = false;
85
86
  /**
87
   * @var bool
88
   */
89
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
90
91
  /**
   * Build a parser, optionally seeded with markup or an existing node.
   *
   * A fresh DOMDocument is always created first. When a SimpleHtmlDom or a
   * \DOMNode is given, a deep copy of that node is imported into the new
   * document; any other non-null value is handed to loadHtml().
   *
   * @param string|SimpleHtmlDom|\DOMNode $element HTML code or SimpleHtmlDom, \DOMNode
   */
  public function __construct($element = null)
  {
    $dom = new \DOMDocument('1.0', $this->getEncoding());

    // DOMDocument settings
    $dom->preserveWhiteSpace = false;
    $dom->formatOutput = true;
    $this->document = $dom;

    // unwrap the wrapper object so only raw DOM nodes are handled below
    $source = $element instanceof SimpleHtmlDom ? $element->getNode() : $element;

    if ($source instanceof \DOMNode) {
      $imported = $dom->importNode($source, true);

      if ($imported instanceof \DOMNode) {
        $dom->appendChild($imported);
      }
    } elseif ($source !== null) {
      $this->loadHtml($source);
    }
  }
122
123
  /**
124
   * @param $name
125
   * @param $arguments
126
   *
127
   * @return bool|mixed
128
   */
129 6 View Code Duplication
  public function __call($name, $arguments)
0 ignored issues
show
Duplication introduced by
This method seems to be duplicated in your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
130
  {
131 6
    if (isset(self::$functionAliases[$name])) {
132 5
      return call_user_func_array(array($this, self::$functionAliases[$name]), $arguments);
133
    }
134
135 1
    throw new BadMethodCallException('Method does not exist: ' . $name);
136
  }
137
138
  /**
   * Handle the static factory aliases "str_get_html" and "file_get_html".
   *
   * @param string $name      Name of the static method that was called
   * @param array  $arguments Call arguments; the first one is the HTML string or the file path
   *
   * @return HtmlDomParser
   *
   * @throws BadMethodCallException if $name is not one of the supported aliases
   */
  public static function __callStatic($name, $arguments)
  {
    // A missing argument is forwarded as null, so the loader rejects it with its
    // own InvalidArgumentException instead of an "undefined offset" notice here.
    $argument = isset($arguments[0]) ? $arguments[0] : null;

    if ($name === 'str_get_html') {
      $parser = new self();

      return $parser->loadHtml($argument);
    }

    if ($name === 'file_get_html') {
      $parser = new self();

      return $parser->loadHtmlFile($argument);
    }

    // same message format as __call(), so the offending name is visible
    throw new BadMethodCallException('Method does not exist: ' . $name);
  }
160
161
  /**
162
   * @param $name
163
   *
164
   * @return string
165
   */
166 10 View Code Duplication
  public function __get($name)
0 ignored issues
show
Duplication introduced by
This method seems to be duplicated in your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
167
  {
168
    switch ($name) {
169 10
      case 'outertext':
170 7
        return $this->html();
171 3
      case 'innertext':
172 1
        return $this->innerHtml();
173 2
      case 'plaintext':
174 1
        return $this->text();
175
    }
176
177 1
    return null;
178
  }
179
180
  /**
   * Allow the parser object to be called like a function; delegates to find().
   *
   * @param string $selector Selector, passed through to find()
   * @param int    $idx      Optional index, passed through to find()
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|null
   */
  public function __invoke($selector, $idx = null)
  {
    $result = $this->find($selector, $idx);

    return $result;
  }
190
191
  /**
   * String conversion yields the same markup as html().
   *
   * @return string
   */
  public function __toString()
  {
    $markup = $this->html();

    return $markup;
  }
198
199
  /**
   * Intentionally a no-op; exists only for API compatibility.
   *
   * @return bool always true
   */
  public function clear()
  {
    $cleared = true;

    return $cleared;
  }
208
209
  /**
   * Replace selected characters with placeholder tokens before the markup is parsed.
   *
   * '&', '|', '+' and '%' are replaced everywhere in the markup. In addition,
   * every http(s) link matched by the pattern below has its '[', ']', '{' and
   * '}' replaced. putReplacedBackToPreserveHtmlEntities() reverses all of these.
   *
   * @param string $html
   *
   * @return string
   */
  private function replaceToPreserveHtmlEntities($html)
  {
    preg_match_all("/(\bhttps?:\/\/[^\s()<>]+(?:\([\w\d]+\)|[^[:punct:]\s]|\/|\}|\]))/i", $html, $matches);

    // default: only the document-wide placeholders
    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    if (!empty($matches[1])) {
      $links = $matches[1];

      $maskedLinks = array();
      foreach ($links as $index => $link) {
        $maskedLinks[$index] = str_replace(
            self::$domLinkReplaceHelper['orig'],
            self::$domLinkReplaceHelper['tmp'],
            $link
        );
      }

      // links go first, so they are rewritten before '&', '|', '+' and '%' are handled
      $search = array_merge($links, $search);
      $replace = array_merge($maskedLinks, $replace);
    }

    return str_replace($search, $replace, $html);
  }
241
242
  /**
   * Turn the placeholder tokens back into their original characters.
   *
   * Covers both the link placeholders and the document-wide placeholders, and
   * additionally strips every "&#13;" from the string.
   *
   * @param string $html
   *
   * @return string
   */
  public static function putReplacedBackToPreserveHtmlEntities($html)
  {
    $placeholders = array_merge(
        self::$domLinkReplaceHelper['tmp'],
        self::$domReplaceHelper['tmp']
    );
    $placeholders[] = '&#13;';

    $originals = array_merge(
        self::$domLinkReplaceHelper['orig'],
        self::$domReplaceHelper['orig']
    );
    $originals[] = '';

    return str_replace($placeholders, $originals, $html);
  }
263
264
  /**
   * create DOMDocument from HTML
   *
   * The markup is first tried as XML via SimpleXML; if that fails or libxml
   * reports any error, it is parsed with DOMDocument::loadHTML() after
   * replaceToPreserveHtmlEntities() swapped special characters for placeholders.
   *
   * Also records whether the input contained no "<" at all and whether it
   * lacked an "<html" wrapper; fixHtmlOutput() reads these flags later.
   *
   * @param string $html
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html)
  {
    if (strpos($html, '<') === false) {
      $this->isDOMDocumentCreatedWithoutHtml = true;
    }

    if (strpos($html, '<html') === false) {
      $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
    }

    // set error level
    $internalErrors = libxml_use_internal_errors(true);
    $disableEntityLoader = libxml_disable_entity_loader(true);
    libxml_clear_errors();

    // NOTE: LIBXML_COMPACT is deliberately not set. The former check
    // "defined(LIBXML_COMPACT)" passed the constant's value instead of its name
    // and therefore never enabled it; libxml2 documents that option as unsafe
    // for trees that are modified afterwards, so the effective behaviour is kept.
    $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;

    $sxe = simplexml_load_string($html, 'SimpleXMLElement', $options);
    if ($sxe !== false && count(libxml_get_errors()) === 0) {
      $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {

      // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
      $html = trim($html);
      $xmlHackUsed = false;
      // stripos(haystack, needle): only prepend the declaration when the
      // markup does not already start with one
      if (stripos($html, '<?xml') !== 0) {
        $xmlHackUsed = true;
        $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
      }

      $html = $this->replaceToPreserveHtmlEntities($html);

      $this->document->loadHTML($html);

      // remove the "xml-encoding" hack
      if ($xmlHackUsed === true) {
        foreach ($this->document->childNodes as $child) {
          if ($child->nodeType === XML_PI_NODE) {
            $this->document->removeChild($child);

            // exactly one instruction was prepended; stopping here also avoids
            // iterating further over a node list that was just modified
            break;
          }
        }
      }

      libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($internalErrors);
    libxml_disable_entity_loader($disableEntityLoader);

    return $this->document;
  }
329
330
  /**
   * Look up the first element matching the selector "#<id>" via find().
   *
   * @param string $id
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementById($id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
341
342
  /**
   * Return the first element with the given tag name, wrapped as SimpleHtmlDom.
   *
   * A SimpleHtmlDomNodeBlank is returned when the document has no such element.
   *
   * @param string $name
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementByTagName($name)
  {
    $first = $this->document->getElementsByTagName($name)->item(0);

    if ($first === null) {
      return new SimpleHtmlDomNodeBlank();
    }

    return new SimpleHtmlDom($first);
  }
359
360
  /**
   * Look up elements matching the selector "#<id>" via find().
   *
   * @param string   $id
   * @param null|int $idx Optional index, passed through to find()
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementsById($id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
372
373
  /**
374
   * Returns Elements by tag name
375
   *
376
   * @param string   $name
377
   * @param null|int $idx
378
   *
379
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
380
   */
381 1 View Code Duplication
  public function getElementsByTagName($name, $idx = null)
0 ignored issues
show
Duplication introduced by
This method seems to be duplicated in your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
382
  {
383 1
    $nodesList = $this->document->getElementsByTagName($name);
384
385 1
    $elements = new SimpleHtmlDomNode();
386
387 1
    foreach ($nodesList as $node) {
388 1
      $elements[] = new SimpleHtmlDom($node);
389
    }
390
391 1
    if (null === $idx) {
392
      return $elements;
393
    } else {
394 1
      if ($idx < 0) {
395
        $idx = count($elements) + $idx;
396
      }
397
    }
398
399 1
    if (isset($elements[$idx])) {
400 1
      return $elements[$idx];
401
    } else {
402
      return new SimpleHtmlDomNodeBlank();
403
    }
404
  }
405
406
  /**
407
   * Find list of nodes with a CSS selector.
408
   *
409
   * @param string $selector
410
   * @param int    $idx
411
   *
412
   * @return SimpleHtmlDom|SimpleHtmlDom[]
413
   */
414 53
  public function find($selector, $idx = null)
415
  {
416 53
    $xPathQuery = SelectorConverter::toXPath($selector);
417
418 53
    $xPath = new DOMXPath($this->document);
419 53
    $nodesList = $xPath->query($xPathQuery);
420 53
    $elements = new SimpleHtmlDomNode();
421
422 53
    foreach ($nodesList as $node) {
423 51
      $elements[] = new SimpleHtmlDom($node);
424
    }
425
426 53
    if (null === $idx) {
427 45
      return $elements;
0 ignored issues
show
Bug Best Practice introduced by
The return type of return $elements; (voku\helper\SimpleHtmlDomNode) is incompatible with the return type documented by voku\helper\HtmlDomParser::find of type voku\helper\SimpleHtmlDo...\helper\SimpleHtmlDom[].

If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type, e.g. an interface or abstract method. This is more formally defined by the Liskov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangeably. This principle also belongs to the SOLID principles for object oriented design.

Let’s take a look at an example:

class Author {
    private $name;

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }
}

abstract class Post {
    public function getAuthor() {
        return 'Johannes';
    }
}

class BlogPost extends Post {
    public function getAuthor() {
        return new Author('Johannes');
    }
}

class ForumPost extends Post { /* ... */ }

function my_function(Post $post) {
    echo strtoupper($post->getAuthor());
}

Our function my_function expects a Post object, and outputs the author of the post. The base class Post returns a simple string and outputting a simple string will work just fine. However, the child class BlogPost which is a sub-type of Post instead decided to return an object, and is therefore violating the SOLID principles. If a BlogPost were passed to my_function, PHP would not complain, but ultimately fail when executing the strtoupper call in its body.

Loading history...
428
    } else {
429 20
      if ($idx < 0) {
430 11
        $idx = count($elements) + $idx;
431
      }
432
    }
433
434 20
    if (isset($elements[$idx])) {
435 20
      return $elements[$idx];
436
    } else {
437
      return new SimpleHtmlDomNodeBlank();
0 ignored issues
show
Bug Best Practice introduced by
The return type of return new \voku\helper\SimpleHtmlDomNodeBlank(); (voku\helper\SimpleHtmlDomNodeBlank) is incompatible with the return type documented by voku\helper\HtmlDomParser::find of type voku\helper\SimpleHtmlDo...\helper\SimpleHtmlDom[].

If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type, e.g. an interface or abstract method. This is more formally defined by the Liskov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangeably. This principle also belongs to the SOLID principles for object oriented design.

Let’s take a look at an example:

class Author {
    private $name;

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }
}

abstract class Post {
    public function getAuthor() {
        return 'Johannes';
    }
}

class BlogPost extends Post {
    public function getAuthor() {
        return new Author('Johannes');
    }
}

class ForumPost extends Post { /* ... */ }

function my_function(Post $post) {
    echo strtoupper($post->getAuthor());
}

Our function my_function expects a Post object, and outputs the author of the post. The base class Post returns a simple string and outputting a simple string will work just fine. However, the child class BlogPost which is a sub-type of Post instead decided to return an object, and is therefore violating the SOLID principles. If a BlogPost were passed to my_function, PHP would not complain, but ultimately fail when executing the strtoupper call in its body.

Loading history...
438
    }
439
  }
440
441
  /**
   * Clean up serialized markup before it is handed back to the caller.
   *
   * Depending on the flags recorded while the document was created, wrapper
   * tags and line breaks are stripped. Afterwards the content is
   * entity-decoded, trimmed, url-decoded and the placeholder tokens are
   * turned back into their original characters.
   *
   * @param string $content
   *
   * @return string
   */
  protected function fixHtmlOutput($content)
  {
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      $wrapperNoise = array(
          "\n",
          "\r\n",
          "\r",
          '<simpleHtmlDomP>',
          '</simpleHtmlDomP>',
          '<body>',
          '</body>',
          '<html>',
          '</html>',
      );

      $content = str_replace($wrapperNoise, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $plainTextNoise = array(
          '<p>',
          '</p>',
          '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">',
      );

      $content = str_replace($plainTextNoise, '', $content);
    }

    $decoded = UTF8::urldecode(
        trim(UTF8::html_entity_decode($content))
    );

    return self::putReplacedBackToPreserveHtmlEntities($decoded);
  }
488
489
  /**
   * Expose the DOMDocument this parser currently works on.
   *
   * @return DOMDocument
   */
  public function getDocument()
  {
    $document = $this->document;

    return $document;
  }
496
497
  /**
   * Return the value of the $encoding property ('UTF-8' by default).
   *
   * @return string
   */
  private function getEncoding()
  {
    $encoding = $this->encoding;

    return $encoding;
  }
506
507
  /**
   * Tell whether the flag set for input without any "<" character is active.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtml;

    return $flag;
  }
514
515
  /**
   * Tell whether the flag set for input without an "<html" wrapper is active.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $flag;
  }
522
523
  /**
   * Get dom node's outer html
   *
   * If a callback was registered via set_callback(), it is invoked with this
   * parser first. The document (or only its root element, when the input had
   * no "<html" wrapper) is then serialized and passed through fixHtmlOutput().
   *
   * @return string
   */
  public function html()
  {
    $callback = static::$callback;
    if ($callback !== null) {
      call_user_func_array($callback, array($this));
    }

    if (!$this->getIsDOMDocumentCreatedWithoutHtmlWrapper()) {
      return $this->fixHtmlOutput($this->document->saveHTML());
    }

    $root = $this->document->documentElement;

    return $this->fixHtmlOutput($this->document->saveHTML($root));
  }
542
543
  /**
   * Get the HTML as XML.
   *
   * The document is serialized with LIBXML_NOEMPTYTAG, the XML header is
   * stripped and the result is passed through fixHtmlOutput().
   *
   * @return string
   */
  public function xml()
  {
    $serialized = $this->document->saveXML(null, LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $withoutHeader = preg_replace('/<\?xml.*\?>/', '', $serialized);

    return $this->fixHtmlOutput(ltrim($withoutHeader));
  }
557
558
  /**
   * Get dom node's inner html
   *
   * Serializes every child of the document's root element, runs each piece
   * through fixHtmlOutput() and concatenates the results.
   *
   * @return string Empty string when the document has no root element
   */
  public function innerHtml()
  {
    $root = $this->document->documentElement;

    // an empty document has no root element; return early instead of
    // reading a property of null and iterating over it
    if ($root === null) {
      return '';
    }

    $text = '';

    foreach ($root->childNodes as $node) {
      $text .= $this->fixHtmlOutput($this->document->saveHTML($node));
    }

    return $text;
  }
573
574
  /**
   * Load HTML from string
   *
   * Replaces the current document with one built by createDOMDocument().
   *
   * @param string $html
   *
   * @return HtmlDomParser this instance, to allow chaining
   *
   * @throws InvalidArgumentException if argument is not string
   */
  public function loadHtml($html)
  {
    if (is_string($html)) {
      $this->document = $this->createDOMDocument($html);

      return $this;
    }

    throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
  }
593
594
  /**
   * Load HTML from file
   *
   * Paths starting with "http://" or "https://" are not checked with
   * file_exists(); everything else must exist on disk.
   *
   * @param string $filePath
   *
   * @return HtmlDomParser this instance, to allow chaining
   *
   * @throws InvalidArgumentException if argument is not string
   * @throws RuntimeException         if the file is missing or cannot be read
   */
  public function loadHtmlFile($filePath)
  {
    if (!is_string($filePath)) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    if (!preg_match("/^https?:\/\//i", $filePath) && !file_exists($filePath)) {
      throw new RuntimeException("File $filePath not found");
    }

    try {
      $html = file_get_contents($filePath);
    } catch (\Exception $e) {
      // chain the caught exception so the root cause stays reachable via getPrevious()
      throw new RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html);

    return $this;
  }
626
627
  /**
   * Save dom as string
   *
   * Returns the result of innerHtml(); when a non-empty path is given, the
   * same string is also written to that file using an exclusive lock.
   *
   * @param string $filepath
   *
   * @return string
   */
  public function save($filepath = '')
  {
    $string = $this->innerHtml();

    if ($filepath === '') {
      return $string;
    }

    file_put_contents($filepath, $string, LOCK_EX);

    return $string;
  }
643
644
  /**
   * Register the callback that html() invokes before serializing.
   *
   * The callback is stored in a static property.
   *
   * @param $functionName
   */
  public function set_callback($functionName)
  {
    static::$callback = $functionName;
  }
651
652
  /**
   * Get dom node's plain text
   *
   * Returns the document's textContent after it went through fixHtmlOutput().
   *
   * @return string
   */
  public function text()
  {
    $plain = $this->document->textContent;

    return $this->fixHtmlOutput($plain);
  }
661
}
662