Completed
Push — master ( b70373...7c5e3f )
by Lars
03:19
created

HtmlDomParser::createDOMDocument()   C

Complexity

Conditions 10
Paths 40

Size

Total Lines 58
Code Lines 31

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 36
CRAP Score 10.0145

Importance

Changes 4
Bugs 2 Features 0
Metric Value
c 4
b 2
f 0
dl 0
loc 58
ccs 36
cts 38
cp 0.9474
rs 6.6515
cc 10
eloc 31
nc 40
nop 1
crap 10.0145

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMDocument;
7
use DOMXPath;
8
use InvalidArgumentException;
9
use RuntimeException;
10
11
/**
12
 * Class HtmlDomParser
13
 *
14
 * @package voku\helper
15
 *
16
 * @property string      outertext Get dom node's outer html
17
 * @property string      innertext Get dom node's inner html
18
 * @property-read string plaintext Get dom node's plain text
19
 *
20
 * @method string outertext() Get dom node's outer html
21
 * @method string innertext() Get dom node's inner html
22
 * @method HtmlDomParser load() load($html) Load HTML from string
23
 * @method HtmlDomParser load_file() load_file($html) Load HTML from file
24
 *
25
 * @method static HtmlDomParser file_get_html() file_get_html($html) Load HTML from file
26
 * @method static HtmlDomParser str_get_html() str_get_html($html) Load HTML from string
27
 */
28
class HtmlDomParser
29
{
30
  /**
   * Legacy method names mapped to the instance methods that __call()
   * forwards them to.
   *
31
   * @var array
32
   */
33
  protected static $functionAliases = array(
34
      'outertext' => 'html',
35
      'innertext' => 'innerHtml',
36
      'load'      => 'loadHtml',
37
      'load_file' => 'loadHtmlFile',
38
  );
39
40
  /**
   * Characters ('orig') that replaceToPreserveHtmlEntities() swaps for the
   * placeholder tokens ('tmp') inside matched http(s) links; the swap is
   * undone by putReplacedBackToPreserveHtmlEntities().
   *
41
   * @var array
42
   */
43
  private static $domLinkReplaceHelper = array(
44
      'orig' => array('[', ']', '{', '}', '%'),
45
      'tmp'  => array(
46
          '!!!!HTML_DOM__SQUARE_BRACKET_LEFT!!!!',
47
          '!!!!HTML_DOM__SQUARE_BRACKET_RIGHT!!!!',
48
          '!!!!HTML_DOM__BRACKET_LEFT!!!!',
49
          '!!!!HTML_DOM__BRACKET_RIGHT!!!!',
50
          '!!!!HTML_DOM__PERCENT!!!!',
51
      ),
52
  );
53
54
  /**
   * Characters ('orig') that replaceToPreserveHtmlEntities() swaps for the
   * placeholder tokens ('tmp') anywhere in the markup before it is handed to
   * DOMDocument::loadHTML(); the swap is undone by
   * putReplacedBackToPreserveHtmlEntities().
   *
55
   * @var array
56
   */
57
  protected static $domReplaceHelper = array(
58
      'orig' => array('&', '|', '+',),
59
      'tmp'  => array('!!!!HTML_DOM__AMP!!!!', '!!!!HTML_DOM__PIPE!!!!', '!!!!HTML_DOM__PLUS!!!!',),
60
  );
61
62
  /**
   * Callback stored by set_callback(); when not null, html() invokes it with
   * the parser instance as its only argument before serialising.
   *
63
   * @var Callable
64
   */
65
  protected static $callback;
66
67
  /**
   * The DOM document all queries and serialisation methods operate on.
   *
68
   * @var DOMDocument
69
   */
70
  protected $document;
71
72
  /**
   * Encoding passed to the DOMDocument constructor and assigned to the
   * document after loading.
   *
73
   * @var string
74
   */
75
  protected $encoding = 'UTF-8';
76
77
  /**
   * Set by createDOMDocument() when the loaded string contains no '<' at all.
   *
78
   * @var bool
79
   */
80
  protected $isDOMDocumentCreatedWithoutHtml = false;
81
82
  /**
   * Set by createDOMDocument() when the loaded string contains no '<html'.
   *
83
   * @var bool
84
   */
85
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
86
87
  /**
   * Create a parser, optionally pre-filled from markup or an existing node.
   *
   * A SimpleHtmlDom is unwrapped to its node first. A \DOMNode is imported
   * (deep copy) into the fresh document; any other non-null value is passed
   * to loadHtml().
   *
   * @param string|SimpleHtmlDom|\DOMNode $element HTML code or SimpleHtmlDom, \DOMNode
   */
  public function __construct($element = null)
  {
    $this->document = new \DOMDocument('1.0', $this->getEncoding());

    // DOMDocument settings
    $this->document->preserveWhiteSpace = false;
    $this->document->formatOutput = true;

    if ($element instanceof SimpleHtmlDom) {
      $element = $element->getNode();
    }

    if (!($element instanceof \DOMNode)) {
      // anything that is not a node is treated as markup (null means "empty parser")
      if ($element !== null) {
        $this->loadHtml($element);
      }

      return;
    }

    $imported = $this->document->importNode($element, true);
    if ($imported instanceof \DOMNode) {
      $this->document->appendChild($imported);
    }
  }
118
119
  /**
   * Forward calls to legacy method names (see self::$functionAliases) to
   * their current equivalents.
   *
   * @param string $name      called method name
   * @param array  $arguments arguments passed to the call
   *
   * @return bool|mixed whatever the aliased method returns
   *
   * @throws BadMethodCallException if $name is not a known alias
   */
  public function __call($name, $arguments)
  {
    if (!isset(self::$functionAliases[$name])) {
      throw new BadMethodCallException('Method does not exist: ' . $name);
    }

    $target = array($this, self::$functionAliases[$name]);

    return call_user_func_array($target, $arguments);
  }
133
134
  /**
   * Static factories kept for API compatibility: str_get_html($html) and
   * file_get_html($path).
   *
   * @param string $name      called static method name
   * @param array  $arguments arguments passed to the call; only the first is used
   *
   * @return HtmlDomParser
   *
   * @throws BadMethodCallException if $name is neither of the two factories
   */
  public static function __callStatic($name, $arguments)
  {
    // A missing argument is passed on as null, so the loader rejects it with
    // its own InvalidArgumentException instead of an "undefined offset" notice.
    $firstArgument = isset($arguments[0]) ? $arguments[0] : null;

    if ($name === 'str_get_html') {
      $parser = new self();

      return $parser->loadHtml($firstArgument);
    }

    if ($name === 'file_get_html') {
      $parser = new self();

      return $parser->loadHtmlFile($firstArgument);
    }

    // same message format as __call()
    throw new BadMethodCallException('Method does not exist: ' . $name);
  }
156
157
  /**
   * Magic read access for the legacy properties outertext, innertext and
   * plaintext.
   *
   * @param string $name property name
   *
   * @return string|null the rendered value, or null for any other property name
   */
  public function __get($name)
  {
    // strict comparisons: a non-string $name must never match by type juggling
    if ($name === 'outertext') {
      return $this->html();
    }

    if ($name === 'innertext') {
      return $this->innerHtml();
    }

    if ($name === 'plaintext') {
      return $this->text();
    }

    return null;
  }
175
176
  /**
   * Calling the parser like a function is a shortcut for find().
   *
   * @param string $selector CSS selector
   * @param int    $idx      optional index, passed through to find()
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|null
   */
  public function __invoke($selector, $idx = null)
  {
    $result = $this->find($selector, $idx);

    return $result;
  }
186
187
  /**
   * String conversion yields the same output as html().
   *
   * @return string
   */
  public function __toString()
  {
    $markup = $this->html();

    return $markup;
  }
194
195
  /**
   * No-op that exists only for API compatibility.
   *
   * @return bool always true
   */
  public function clear()
  {
    // nothing to release
    return true;
  }
204
205
  /**
   * Replace characters that must survive DOMDocument::loadHTML() unchanged
   * with placeholder tokens.
   *
   * Every http(s) link found in the markup is swapped for a copy in which the
   * characters of self::$domLinkReplaceHelper are tokenised; afterwards the
   * characters of self::$domReplaceHelper are tokenised in the whole string.
   *
   * @param string $html
   *
   * @return string
   */
  private function replaceToPreserveHtmlEntities($html)
  {
    preg_match_all("/(\bhttps?:\/\/[^\s()<>]+(?:\([\w\d]+\)|[^[:punct:]\s]|\/|\}|\]))/i", $html, $matches);

    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    if (!empty($matches[1])) {
      $foundLinks = $matches[1];

      // str_replace() on an array subject keeps the keys, so both lists stay aligned
      $maskedLinks = str_replace(
          self::$domLinkReplaceHelper['orig'],
          self::$domLinkReplaceHelper['tmp'],
          $foundLinks
      );

      // links go first: they have to be replaced before the global characters are
      $search = array_merge($foundLinks, $search);
      $replace = array_merge($maskedLinks, $replace);
    }

    return str_replace($search, $replace, $html);
  }
237
238
  /**
   * Turn the placeholder tokens of both replace helpers back into their
   * original characters and drop every '&#13;' entity.
   *
   * @param string $html
   *
   * @return string
   */
  public static function putReplacedBackToPreserveHtmlEntities($html)
  {
    $placeholders = array_merge(
        self::$domLinkReplaceHelper['tmp'],
        self::$domReplaceHelper['tmp'],
        array('&#13;')
    );

    $originals = array_merge(
        self::$domLinkReplaceHelper['orig'],
        self::$domReplaceHelper['orig'],
        array('')
    );

    return str_replace($placeholders, $originals, $html);
  }
259
260
  /**
   * Create the DOMDocument for the given markup.
   *
   * The string is first parsed as XML via SimpleXML. If that fails, or libxml
   * recorded any error, it is parsed with DOMDocument::loadHTML() instead,
   * using an "<?xml encoding=...?>" prefix to force the encoding and
   * placeholder tokens to protect some characters.
   *
   * Also records whether the input contained any tag at all and whether it
   * contained an "<html" wrapper; fixHtmlOutput() relies on both flags.
   *
   * @param string $html
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html)
  {
    // Recompute both flags on every load, so that re-using the parser does
    // not inherit the state of a previously loaded string.
    $this->isDOMDocumentCreatedWithoutHtml = (strpos($html, '<') === false);
    $this->isDOMDocumentCreatedWithoutHtmlWrapper = (strpos($html, '<html') === false);

    // collect libxml errors internally and block external entities while parsing
    $internalErrors = libxml_use_internal_errors(true);
    $disableEntityLoader = libxml_disable_entity_loader(true);
    libxml_clear_errors();

    $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;
    // defined() expects the constant's *name*; passing the constant itself never matches
    if (defined('LIBXML_COMPACT')) {
      $options |= LIBXML_COMPACT;
    }

    $sxe = simplexml_load_string($html, 'SimpleXMLElement', $options);
    if ($sxe !== false && count(libxml_get_errors()) === 0) {
      $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {

      // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
      $html = trim($html);
      $xmlHackUsed = false;
      // only prepend the declaration if the markup does not already start with one
      if (stripos($html, '<?xml') !== 0) {
        $xmlHackUsed = true;
        $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
      }

      $html = $this->replaceToPreserveHtmlEntities($html);

      $this->document->loadHTML($html);

      // remove the "xml-encoding" hack
      if ($xmlHackUsed === true) {
        // childNodes is a live list: collect first, remove afterwards, so
        // that no sibling is skipped while the list shrinks
        $processingInstructions = array();
        foreach ($this->document->childNodes as $child) {
          if ($child->nodeType === XML_PI_NODE) {
            $processingInstructions[] = $child;
          }
        }

        foreach ($processingInstructions as $processingInstruction) {
          $this->document->removeChild($processingInstruction);
        }
      }

      libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($internalErrors);
    libxml_disable_entity_loader($disableEntityLoader);

    return $this->document;
  }
325
326
  /**
   * Return the first element matching the given id (find('#id', 0)).
   *
   * @param string $id
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementById($id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
337
338
  /**
   * Return the first element with the given tag name, wrapped as
   * SimpleHtmlDom, or a blank node if the document has no such element.
   *
   * @param string $name
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementByTagName($name)
  {
    $firstMatch = $this->document->getElementsByTagName($name)->item(0);

    if ($firstMatch === null) {
      return new SimpleHtmlDomNodeBlank();
    }

    return new SimpleHtmlDom($firstMatch);
  }
355
356
  /**
   * Return the elements matching the given id (find('#id', $idx)).
   *
   * @param string   $id
   * @param null|int $idx optional index, passed through to find()
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementsById($id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
368
369
  /**
   * Return the elements with the given tag name.
   *
   * Without $idx the whole collection is returned. With $idx only that entry
   * is returned (a negative $idx counts from the end), or a blank node if the
   * index does not exist.
   *
   * @param string   $name
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $elements = new SimpleHtmlDomNode();

    foreach ($this->document->getElementsByTagName($name) as $domNode) {
      $elements[] = new SimpleHtmlDom($domNode);
    }

    if ($idx === null) {
      return $elements;
    }

    if ($idx < 0) {
      $idx += count($elements);
    }

    return isset($elements[$idx]) ? $elements[$idx] : new SimpleHtmlDomNodeBlank();
  }
401
402
  /**
   * Find nodes with a CSS selector.
   *
   * The selector is converted to XPath by SelectorConverter and evaluated
   * against the document. Without $idx the whole collection is returned. With
   * $idx only that entry is returned (a negative $idx counts from the end),
   * or a blank node if the index does not exist.
   *
   * @param string $selector
   * @param int    $idx
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
   */
  public function find($selector, $idx = null)
  {
    $xPath = new DOMXPath($this->document);
    $matches = $xPath->query(SelectorConverter::toXPath($selector));

    $elements = new SimpleHtmlDomNode();
    foreach ($matches as $domNode) {
      $elements[] = new SimpleHtmlDom($domNode);
    }

    if ($idx === null) {
      return $elements;
    }

    if ($idx < 0) {
      $idx += count($elements);
    }

    return isset($elements[$idx]) ? $elements[$idx] : new SimpleHtmlDomNodeBlank();
  }
436
437
  /**
   * Post-process serialised markup before it is handed to the caller.
   *
   * Depending on how the document was created, line breaks and the wrapper
   * tags are stripped again; then entities and URL-encoding are decoded and
   * the placeholder tokens are turned back into their original characters.
   *
   * @param string $content
   *
   * @return string
   */
  protected function fixHtmlOutput($content)
  {
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      $wrapperNoise = array(
          "\n",
          "\r\n",
          "\r",
          '<simpleHtmlDomP>',
          '</simpleHtmlDomP>',
          '<body>',
          '</body>',
          '<html>',
          '</html>',
      );
      $content = str_replace($wrapperNoise, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $plainTextNoise = array(
          '<p>',
          '</p>',
          '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">'
      );
      $content = str_replace($plainTextNoise, '', $content);
    }

    $decoded = trim(UTF8::html_entity_decode($content));
    $decoded = UTF8::urldecode($decoded);

    return self::putReplacedBackToPreserveHtmlEntities($decoded);
  }
484
485
  /**
   * Access the underlying DOM document.
   *
   * @return DOMDocument
   */
  public function getDocument()
  {
    $document = $this->document;

    return $document;
  }
492
493
  /**
   * Encoding used when creating and loading the document.
   *
   * @return string
   */
  private function getEncoding()
  {
    $encoding = $this->encoding;

    return $encoding;
  }
502
503
  /**
   * Whether the loaded string was flagged as containing no '<' at all.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtml;

    return $flag;
  }
510
511
  /**
   * Whether the loaded string was flagged as containing no '<html'.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $flag;
  }
518
519
  /**
   * Get the document's outer html.
   *
   * A callback registered via set_callback() is invoked with this parser
   * first. If the input had no "<html" wrapper only the document element is
   * serialised, otherwise the whole document.
   *
   * @return string
   */
  public function html()
  {
    if ($this::$callback !== null) {
      call_user_func_array($this::$callback, array($this));
    }

    $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($content);
  }
538
539
  /**
   * Get the document serialised as XML, without the XML declaration.
   *
   * @return string
   */
  public function xml()
  {
    $serialized = $this->document->saveXML(null, LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $withoutHeader = preg_replace('/<\?xml.*\?>/', '', $serialized);

    return $this->fixHtmlOutput(ltrim($withoutHeader));
  }
553
554
  /**
   * Get the document's inner html: the cleaned-up markup of every child of
   * the document element, concatenated.
   *
   * @return string empty string if the document has no document element
   */
  public function innerHtml()
  {
    $root = $this->document->documentElement;

    // a parser created without content has no document element yet
    if ($root === null) {
      return '';
    }

    $text = '';

    foreach ($root->childNodes as $node) {
      $text .= $this->fixHtmlOutput($this->document->saveHTML($node));
    }

    return $text;
  }
569
570
  /**
   * Load HTML from a string, replacing the current document.
   *
   * @param string $html
   *
   * @return HtmlDomParser the parser itself
   *
   * @throws InvalidArgumentException if argument is not string
   */
  public function loadHtml($html)
  {
    if (is_string($html) === false) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    $document = $this->createDOMDocument($html);
    $this->document = $document;

    return $this;
  }
589
590
  /**
   * Load HTML from a local file or an http(s) URL, replacing the current
   * document.
   *
   * @param string $filePath
   *
   * @return HtmlDomParser the parser itself
   *
   * @throws InvalidArgumentException if argument is not string
   * @throws RuntimeException         if a local file does not exist or the
   *                                  content cannot be read
   */
  public function loadHtmlFile($filePath)
  {
    if (!is_string($filePath)) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    // file_exists() is only meaningful for local paths, so http(s) URLs skip it
    if (!preg_match("/^https?:\/\//i", $filePath) && !file_exists($filePath)) {
      throw new RuntimeException("File $filePath not found");
    }

    try {
      $html = file_get_contents($filePath);
    } catch (\Exception $e) {
      // keep the original failure attached, so the root cause stays visible
      throw new RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html);

    return $this;
  }
622
623
  /**
   * Return the inner html and, if a path is given, also write it to that
   * file (with an exclusive lock).
   *
   * @param string $filepath target file; nothing is written for ''
   *
   * @return string the inner html
   */
  public function save($filepath = '')
  {
    $markup = $this->innerHtml();

    if ($filepath === '') {
      return $markup;
    }

    file_put_contents($filepath, $markup, LOCK_EX);

    return $markup;
  }
639
640
  /**
   * Register the callback that html() invokes before serialising. The
   * callback is stored in a static property.
   *
   * @param $functionName
   */
  public function set_callback($functionName)
  {
    $callback = $functionName;

    $this::$callback = $callback;
  }
647
648
  /**
   * Get the document's plain text: its textContent, run through
   * fixHtmlOutput().
   *
   * @return string
   */
  public function text()
  {
    $plainText = $this->document->textContent;

    return $this->fixHtmlOutput($plainText);
  }
657
}
658