Completed
Push — master ( 5375ae...c8ea6b )
by Lars
02:38
created

HtmlDomParser   D

Complexity

Total Complexity 80

Size/Duplication

Total Lines 664
Duplicated Lines 4.07 %

Coupling/Cohesion

Components 1
Dependencies 5

Test Coverage

Coverage 29.57%

Importance

Changes 22
Bugs 9 Features 5
Metric Value
wmc 80
c 22
b 9
f 5
lcom 1
cbo 5
dl 27
loc 664
ccs 76
cts 257
cp 0.2957
rs 4.6062

28 Methods

Rating   Name   Duplication   Size   Complexity  
A __callStatic() 0 16 3
A clear() 0 4 1
A __call() 3 10 2
B __construct() 0 26 5
B __get() 0 18 7
A __invoke() 0 4 1
A __toString() 0 4 1
B replaceToPreserveHtmlEntities() 0 27 5
A putReplacedBackToPreserveHtmlEntities() 0 16 1
F createDOMDocument() 0 79 15
A getElementById() 0 4 1
A getElementByTagName() 0 10 2
A getElementsById() 0 4 1
B getElementsByTagName() 24 24 5
B find() 0 26 5
B fixHtmlOutput() 0 42 3
A getDocument() 0 4 1
A getEncoding() 0 4 1
A getIsDOMDocumentCreatedWithoutHtml() 0 4 1
A getIsDOMDocumentCreatedWithoutHtmlWrapper() 0 4 1
A html() 0 14 3
A xml() 0 9 1
A innerHtml() 0 10 2
A loadHtml() 0 10 2
B loadHtmlFile() 0 25 6
A save() 0 9 2
A set_callback() 0 4 1
A text() 0 4 1

How to fix   Duplicated Code    Complexity   

Duplicated Code

Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.

Common duplication problems, and corresponding solutions are:

Complex Class

 Tip:   Before tackling complexity, make sure that you eliminate any duplication first. This often can reduce the size of classes significantly.

Complex classes like HtmlDomParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use HtmlDomParser, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMDocument;
7
use DOMXPath;
8
use InvalidArgumentException;
9
use RuntimeException;
10
11
/**
12
 * Class HtmlDomParser
13
 *
14
 * @package voku\helper
15
 *
16
 * @property-read string outerText Get dom node's outer html (alias for "outerHtml")
17
 * @property-read string outerHtml Get dom node's outer html
18
 * @property-read string innerText Get dom node's inner html (alias for "innerHtml")
19
 * @property-read string innerHtml Get dom node's inner html
20
 * @property-read string plaintext Get dom node's plain text
21
 *
22
 * @method string outerText() Get dom node's outer html (alias for "outerHtml()")
23
 * @method string outerHtml() Get dom node's outer html
24
 * @method string innerText() Get dom node's inner html (alias for "innerHtml()")
25
 * @method HtmlDomParser load() load($html) Load HTML from string
26
 * @method HtmlDomParser load_file() load_file($html) Load HTML from file
27
 *
28
 * @method static HtmlDomParser file_get_html() file_get_html($html) Load HTML from file
29
 * @method static HtmlDomParser str_get_html() str_get_html($html) Load HTML from string
30
 */
31
class HtmlDomParser
32
{
33
  /**
34
   * @var array
35
   */
36
  protected static $functionAliases = array(
37
      'outertext' => 'html',
38
      'outerhtml' => 'html',
39
      'innertext' => 'innerHtml',
40
      'innerhtml' => 'innerHtml',
41
      'load'      => 'loadHtml',
42
      'load_file' => 'loadHtmlFile',
43
  );
44
45
  /**
46
   * @var array
47
   */
48
  private static $domLinkReplaceHelper = array(
49
      'orig' => array('[', ']', '{', '}',),
50
      'tmp'  => array(
51
          '!!!!HTML_DOM__SQUARE_BRACKET_LEFT!!!!',
52
          '!!!!HTML_DOM__SQUARE_BRACKET_RIGHT!!!!',
53
          '!!!!HTML_DOM__BRACKET_LEFT!!!!',
54
          '!!!!HTML_DOM__BRACKET_RIGHT!!!!',
55
      ),
56
  );
57
58
  /**
59
   * @var array
60
   */
61
  protected static $domReplaceHelper = array(
62
      'orig' => array('&', '|', '+', '%'),
63
      'tmp'  => array(
64
          '!!!!HTML_DOM__AMP!!!!',
65
          '!!!!HTML_DOM__PIPE!!!!',
66
          '!!!!HTML_DOM__PLUS!!!!',
67
          '!!!!HTML_DOM__PERCENT!!!!',
68
      ),
69
  );
70
71
  /**
72
   * @var Callable
73
   */
74
  protected static $callback;
75
76
  /**
77
   * @var DOMDocument
78
   */
79
  protected $document;
80
81
  /**
82
   * @var string
83
   */
84
  protected $encoding = 'UTF-8';
85
86
  /**
87
   * @var bool
88
   */
89
  protected $isDOMDocumentCreatedWithoutHtml = false;
90
91
  /**
92
   * @var bool
93
   */
94
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
95
96
  /**
   * Build a parser, optionally seeded with markup or an existing node.
   *
   * A SimpleHtmlDom wrapper is unwrapped through getNode(); a \DOMNode is
   * deep-imported into the freshly created document; any other non-null
   * value is passed on to loadHtml().
   *
   * @param string|SimpleHtmlDom|\DOMNode $element HTML code or SimpleHtmlDom, \DOMNode
   */
  public function __construct($element = null)
  {
    $document = new \DOMDocument('1.0', $this->getEncoding());

    // DOMDocument settings
    $document->preserveWhiteSpace = false;
    $document->formatOutput = true;

    $this->document = $document;

    $source = $element instanceof SimpleHtmlDom ? $element->getNode() : $element;

    if ($source instanceof \DOMNode) {
      $imported = $this->document->importNode($source, true);

      if ($imported instanceof \DOMNode) {
        $this->document->appendChild($imported);
      }
    } elseif ($source !== null) {
      $this->loadHtml($source);
    }
  }
127
128
  /**
   * Dispatch calls to legacy method names onto their current implementation.
   *
   * The requested name is lower-cased and looked up in self::$functionAliases.
   *
   * @param string $name      called method name (matched case-insensitively)
   * @param array  $arguments arguments forwarded to the aliased method
   *
   * @return bool|mixed whatever the aliased method returns
   *
   * @throws BadMethodCallException if no alias is registered for the name
   */
  public function __call($name, $arguments)
  {
    $alias = strtolower($name);

    if (!isset(self::$functionAliases[$alias])) {
      throw new BadMethodCallException('Method does not exist: ' . $alias);
    }

    $target = array($this, self::$functionAliases[$alias]);

    return call_user_func_array($target, $arguments);
  }
144
145
  /**
   * Static entry points "str_get_html" and "file_get_html".
   *
   * Each creates a new parser and loads the first argument as an HTML string
   * or as a file path respectively.
   *
   * @param string $name      called static method name
   * @param array  $arguments call arguments; only index 0 is used
   *
   * @return HtmlDomParser
   *
   * @throws BadMethodCallException for any other method name
   */
  public static function __callStatic($name, $arguments)
  {
    switch ($name) {
      case 'str_get_html':
        $parser = new self();

        return $parser->loadHtml($arguments[0]);

      case 'file_get_html':
        $parser = new self();

        return $parser->loadHtmlFile($arguments[0]);

      default:
        throw new BadMethodCallException('Method does not exist');
    }
  }
167
168
  /**
   * Expose html(), innerHtml() and text() as read-only virtual properties.
   *
   * The property name is matched case-insensitively.
   *
   * @param string $name property name
   *
   * @return string|null the rendered content, or null for an unknown property
   */
  public function __get($name)
  {
    $accessors = array(
        'outerhtml' => 'html',
        'outertext' => 'html',
        'innerhtml' => 'innerHtml',
        'innertext' => 'innerHtml',
        'text'      => 'text',
        'plaintext' => 'text',
    );

    $key = strtolower($name);

    if (!isset($accessors[$key])) {
      return null;
    }

    $method = $accessors[$key];

    return $this->$method();
  }
191
192
  /**
   * Calling the parser like a function is shorthand for find().
   *
   * @param string $selector CSS selector
   * @param int    $idx      optional index forwarded to find()
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|null
   */
  public function __invoke($selector, $idx = null)
  {
    $result = $this->find($selector, $idx);

    return $result;
  }
202
203
  /**
   * String conversion yields the same output as html().
   *
   * @return string
   */
  public function __toString()
  {
    $markup = $this->html();

    return $markup;
  }
210
211
  /**
   * No-op kept only for API compatibility.
   *
   * @return bool always true
   */
  public function clear()
  {
    $done = true;

    return $done;
  }
220
221
  /**
   * Swap characters for placeholder tokens before the markup is parsed.
   *
   * Every http(s) link found in the input has its brackets and braces replaced
   * with the self::$domLinkReplaceHelper tokens; afterwards the characters listed
   * in self::$domReplaceHelper are replaced throughout the whole string.
   * putReplacedBackToPreserveHtmlEntities() performs the reverse mapping.
   *
   * @param string $html
   *
   * @return string
   */
  private function replaceToPreserveHtmlEntities($html)
  {
    // default: only the generic character placeholders
    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    preg_match_all("/(\bhttps?:\/\/[^\s()<>]+(?:\([\w\d]+\)|[^[:punct:]\s]|\/|\}|\]))/i", $html, $matches);

    if (!empty($matches[1])) {
      $links = $matches[1];
      $maskedLinks = array();

      foreach ($links as $position => $link) {
        $maskedLinks[$position] = str_replace(
            self::$domLinkReplaceHelper['orig'],
            self::$domLinkReplaceHelper['tmp'],
            $link
        );
      }

      // links go first so they are rewritten before the generic characters
      $search = array_merge($links, $search);
      $replace = array_merge($maskedLinks, $replace);
    }

    return str_replace($search, $replace, $html);
  }
253
254
  /**
   * Turn the placeholder tokens back into their original characters.
   *
   * Also removes every "&#13;" sequence from the string.
   *
   * @param string $html
   *
   * @return string
   */
  public static function putReplacedBackToPreserveHtmlEntities($html)
  {
    $placeholders = array_merge(
        self::$domLinkReplaceHelper['tmp'],
        self::$domReplaceHelper['tmp'],
        array('&#13;')
    );

    $originals = array_merge(
        self::$domLinkReplaceHelper['orig'],
        self::$domReplaceHelper['orig'],
        array('')
    );

    return str_replace($placeholders, $originals, $html);
  }
275
276
  /**
   * Create the DOMDocument from an HTML (or XML) string.
   *
   * The input is first tried as well-formed XML via SimpleXML; if that fails or
   * libxml reported any error, it is loaded through DOMDocument::loadHTML() with
   * an "<?xml encoding=...?>" prefix so that libxml picks up the configured encoding.
   *
   * Also records whether the input contained any tag at all and whether it
   * contained an "<html" wrapper; both flags are recomputed on every call.
   *
   * @param string $html
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html)
  {
    // Recompute (not only raise) the flags so that re-using one parser for
    // several loads does not carry state over from a previous input.
    $this->isDOMDocumentCreatedWithoutHtml = (strpos($html, '<') === false);
    $this->isDOMDocumentCreatedWithoutHtmlWrapper = (strpos($html, '<html') === false);

    // set error level
    $internalErrors = libxml_use_internal_errors(true);
    $disableEntityLoader = libxml_disable_entity_loader(true);
    libxml_clear_errors();

    $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;

    // defined() expects the constant's *name*; passing the constant itself
    // (as the previous code did) never matches and fails on builds where the
    // constant is missing.
    if (defined('LIBXML_PARSEHUGE')) {
      $options |= LIBXML_PARSEHUGE;
    }

    if (defined('LIBXML_BIGLINES')) {
      $options |= LIBXML_BIGLINES;
    }

    // NOTE(review): the previous code also tried to add LIBXML_NOXMLDECL,
    // LIBXML_COMPACT, LIBXML_HTML_NOIMPLIED and LIBXML_HTML_NODEFDTD, but its
    // guards could never pass, so these flags were never in effect. They stay
    // off here because they may change the parsed tree / serialized output that
    // fixHtmlOutput() post-processes (html/body wrapper, default DOCTYPE).
    // Confirm against the test-suite before enabling any of them.

    $sxe = simplexml_load_string($html, 'SimpleXMLElement', $options);
    if ($sxe !== false && count(libxml_get_errors()) === 0) {
      $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {

      // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
      $html = trim($html);
      $xmlHackUsed = false;
      // haystack first, needle second: only prepend when the markup does not
      // already start with an xml declaration
      if (stripos($html, '<?xml') !== 0) {
        $xmlHackUsed = true;
        $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
      }

      $html = $this->replaceToPreserveHtmlEntities($html);

      $this->document->loadHTML($html, $options);

      // remove the "xml-encoding" hack
      if ($xmlHackUsed === true) {
        // collect first: removing from the live childNodes list while
        // iterating over it can skip nodes
        $processingInstructions = array();
        foreach ($this->document->childNodes as $child) {
          if ($child->nodeType == XML_PI_NODE) {
            $processingInstructions[] = $child;
          }
        }

        foreach ($processingInstructions as $processingInstruction) {
          $this->document->removeChild($processingInstruction);
        }
      }

      libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($internalErrors);
    libxml_disable_entity_loader($disableEntityLoader);

    return $this->document;
  }
362
363
  /**
   * Look up the first element matching the "#<id>" selector.
   *
   * @param string $id
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementById($id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
374
375
  /**
   * Return the first element with the given tag name.
   *
   * @param string $name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank blank node when nothing matches
   */
  public function getElementByTagName($name)
  {
    $firstMatch = $this->document->getElementsByTagName($name)->item(0);

    if ($firstMatch === null) {
      return new SimpleHtmlDomNodeBlank();
    }

    return new SimpleHtmlDom($firstMatch);
  }
392
393
  /**
   * Look up elements matching the "#<id>" selector.
   *
   * @param string   $id
   * @param null|int $idx optional index forwarded to find()
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementsById($id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
405
406
  /**
   * Collect all elements with the given tag name.
   *
   * Without an index the whole collection is returned. With an index only that
   * entry is returned; a negative index counts from the end of the collection.
   *
   * @param string   $name
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank blank node when the index does not exist
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $collection = new SimpleHtmlDomNode();

    foreach ($this->document->getElementsByTagName($name) as $domNode) {
      $collection[] = new SimpleHtmlDom($domNode);
    }

    if ($idx === null) {
      return $collection;
    }

    $position = $idx < 0 ? count($collection) + $idx : $idx;

    return isset($collection[$position]) ? $collection[$position] : new SimpleHtmlDomNodeBlank();
  }
438
439
  /**
   * Find list of nodes with a CSS selector.
   *
   * The selector is converted to XPath via SelectorConverter::toXPath() and
   * evaluated against the current document. Without an index the whole
   * collection is returned; with an index only that entry is returned, where a
   * negative index counts from the end of the collection.
   *
   * @param string   $selector
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDom|SimpleHtmlDom[]|SimpleHtmlDomNodeBlank
   *         the collection when $idx is null, otherwise the selected element
   *         or a blank node when that index does not exist
   */
  public function find($selector, $idx = null)
  {
    $xPathQuery = SelectorConverter::toXPath($selector);

    $xPath = new DOMXPath($this->document);
    $nodesList = $xPath->query($xPathQuery);
    $elements = new SimpleHtmlDomNode();

    // DOMXPath::query() yields false for an expression it cannot evaluate;
    // treat that like "no match" instead of iterating over a boolean.
    if ($nodesList !== false) {
      foreach ($nodesList as $node) {
        $elements[] = new SimpleHtmlDom($node);
      }
    }

    if (null === $idx) {
      return $elements;
    }

    if ($idx < 0) {
      $idx = count($elements) + $idx;
    }

    if (isset($elements[$idx])) {
      return $elements[$idx];
    }

    return new SimpleHtmlDomNodeBlank();
  }
473
474
  /**
   * Post-process serialized output before it is handed back to the caller.
   *
   * Strips line breaks and html/body wrapper tags when the input had no
   * "<html" wrapper, strips paragraph tags and the default DOCTYPE when the
   * input had no tags at all, then decodes entities, trims, url-decodes and
   * finally restores the placeholder tokens.
   *
   * @param string $content
   *
   * @return string
   */
  protected function fixHtmlOutput($content)
  {
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      $wrapperNoise = array(
          "\n",
          "\r\n",
          "\r",
          '<simpleHtmlDomP>',
          '</simpleHtmlDomP>',
          '<body>',
          '</body>',
          '<html>',
          '</html>',
      );

      $content = str_replace($wrapperNoise, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $plainTextNoise = array(
          '<p>',
          '</p>',
          '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">'
      );

      $content = str_replace($plainTextNoise, '', $content);
    }

    $decoded = trim(UTF8::html_entity_decode($content));
    $decoded = UTF8::urldecode($decoded);

    return self::putReplacedBackToPreserveHtmlEntities($decoded);
  }
521
522
  /**
   * Access the underlying DOMDocument instance.
   *
   * @return DOMDocument
   */
  public function getDocument()
  {
    $document = $this->document;

    return $document;
  }
529
530
  /**
   * Encoding used for the document (value of $this->encoding).
   *
   * @return string
   */
  private function getEncoding()
  {
    $encoding = $this->encoding;

    return $encoding;
  }
539
540
  /**
   * Whether the loaded input was flagged as containing no "<" at all.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtml;

    return $flag;
  }
547
548
  /**
   * Whether the loaded input was flagged as containing no "<html" wrapper.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $flag;
  }
555
556
  /**
   * Get dom node's outer html
   *
   * Invokes the registered callback (if any) with this parser first. When the
   * input had no "<html" wrapper only the document element is serialized,
   * otherwise the whole document; the result is passed through fixHtmlOutput().
   *
   * @return string
   */
  public function html()
  {
    $callback = $this::$callback;
    if ($callback !== null) {
      call_user_func_array($callback, array($this));
    }

    $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($content);
  }
575
576
  /**
   * Get the HTML as XML.
   *
   * Serializes the document with LIBXML_NOEMPTYTAG, drops the XML header and
   * runs the result through fixHtmlOutput().
   *
   * @return string
   */
  public function xml()
  {
    $serialized = $this->document->saveXML(null, LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $withoutHeader = preg_replace('/<\?xml.*\?>/', '', $serialized);
    $withoutHeader = ltrim($withoutHeader);

    return $this->fixHtmlOutput($withoutHeader);
  }
590
591
  /**
   * Get dom node's inner html
   *
   * Concatenates the serialized, post-processed children of the document
   * element. An empty document (no document element yet) yields an empty string.
   *
   * @return string
   */
  public function innerHtml()
  {
    $text = '';

    // A parser that has not loaded anything has no document element;
    // there is nothing to serialize in that case.
    $root = $this->document->documentElement;
    if ($root === null) {
      return $text;
    }

    foreach ($root->childNodes as $node) {
      $text .= $this->fixHtmlOutput($this->document->saveHTML($node));
    }

    return $text;
  }
606
607
  /**
   * Load HTML from string
   *
   * Replaces the current document with one created from the given markup.
   *
   * @param string $html
   *
   * @return HtmlDomParser this instance
   *
   * @throws InvalidArgumentException if argument is not string
   */
  public function loadHtml($html)
  {
    if (is_string($html)) {
      $this->document = $this->createDOMDocument($html);

      return $this;
    }

    throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
  }
626
627
  /**
   * Load HTML from file
   *
   * Paths starting with http:// or https:// skip the file_exists() check;
   * the content is read with file_get_contents() and passed to loadHtml().
   *
   * @param string $filePath
   *
   * @return HtmlDomParser this instance
   *
   * @throws InvalidArgumentException if argument is not string
   * @throws RuntimeException if the file does not exist or cannot be read
   */
  public function loadHtmlFile($filePath)
  {
    if (!is_string($filePath)) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    $isRemote = preg_match("/^https?:\/\//i", $filePath);
    if (!$isRemote && !file_exists($filePath)) {
      throw new RuntimeException("File $filePath not found");
    }

    $unreadable = "Could not load file $filePath";

    try {
      $html = file_get_contents($filePath);
    } catch (\Exception $e) {
      throw new RuntimeException($unreadable);
    }

    if ($html === false) {
      throw new RuntimeException($unreadable);
    }

    $this->loadHtml($html);

    return $this;
  }
659
660
  /**
   * Save dom as string
   *
   * Returns the inner html and, when a non-empty path is given, also writes it
   * to that file under an exclusive lock.
   *
   * @param string $filepath
   *
   * @return string
   */
  public function save($filepath = '')
  {
    $output = $this->innerHtml();

    $shouldWrite = ($filepath !== '');
    if ($shouldWrite) {
      file_put_contents($filepath, $output, LOCK_EX);
    }

    return $output;
  }
676
677
  /**
   * Register the callback that html() invokes before serializing.
   *
   * The callback is stored in a static property.
   *
   * @param callable $functionName
   */
  public function set_callback($functionName)
  {
    $callback = $functionName;

    $this::$callback = $callback;
  }
684
685
  /**
   * Get dom node's plain text
   *
   * Uses the document's textContent, passed through fixHtmlOutput().
   *
   * @return string
   */
  public function text()
  {
    $plain = $this->document->textContent;

    return $this->fixHtmlOutput($plain);
  }
694
}
695