Completed
Push — master ( 4cdcad...57d66c )
by Lars
02:26
created

HtmlDomParser::find()   B

Complexity

Conditions 5
Paths 10

Size

Total Lines 26
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 9
CRAP Score 5

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 26
ccs 9
cts 9
cp 1
rs 8.439
cc 5
eloc 16
nc 10
nop 2
crap 5
1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMDocument;
7
use DOMXPath;
8
use InvalidArgumentException;
9
use RuntimeException;
10
11
/**
12
 * Class HtmlDomParser
13
 *
14
 * @package voku\helper
15
 *
16
 * @property-read string outerText Get dom node's outer html (alias for "outerHtml")
17
 * @property-read string outerHtml Get dom node's outer html
18
 * @property-read string innerText Get dom node's inner html (alias for "innerHtml")
19
 * @property-read string innerHtml Get dom node's inner html
20
 * @property-read string plaintext Get dom node's plain text
21
 *
22
 * @method string outerText() Get dom node's outer html (alias for "outerHtml()")
23
 * @method string outerHtml() Get dom node's outer html
24
 * @method string innerText() Get dom node's inner html (alias for "innerHtml()")
25
 * @method HtmlDomParser load() load($html) Load HTML from string
26
 * @method HtmlDomParser load_file() load_file($html) Load HTML from file
27
 *
28
 * @method static HtmlDomParser file_get_html() file_get_html($html) Load HTML from file
29
 * @method static HtmlDomParser str_get_html() str_get_html($html) Load HTML from string
30
 */
31
class HtmlDomParser
32
{
33
  /**
   * Legacy method names (lower-cased) mapped to the methods that implement
   * them; looked up by __call().
   *
   * @var array
   */
  protected static $functionAliases = array(
      'outertext' => 'html',
      'outerhtml' => 'html',
      'innertext' => 'innerHtml',
      'innerhtml' => 'innerHtml',
      'load'      => 'loadHtml',
      'load_file' => 'loadHtmlFile',
  );

  /**
   * Characters that are masked inside detected links ("orig") and the
   * placeholders they are swapped with ("tmp"); both lists are index-aligned.
   *
   * @var array
   */
  private static $domLinkReplaceHelper = array(
      'orig' => array('[', ']', '{', '}',),
      'tmp'  => array(
          '!!!!HTML_DOM__SQUARE_BRACKET_LEFT!!!!',
          '!!!!HTML_DOM__SQUARE_BRACKET_RIGHT!!!!',
          '!!!!HTML_DOM__BRACKET_LEFT!!!!',
          '!!!!HTML_DOM__BRACKET_RIGHT!!!!',
      ),
  );

  /**
   * Characters that are masked in the whole input ("orig") and the
   * placeholders they are swapped with ("tmp"); both lists are index-aligned.
   *
   * @var array
   */
  protected static $domReplaceHelper = array(
      'orig' => array('&', '|', '+', '%'),
      'tmp'  => array(
          '!!!!HTML_DOM__AMP!!!!',
          '!!!!HTML_DOM__PIPE!!!!',
          '!!!!HTML_DOM__PLUS!!!!',
          '!!!!HTML_DOM__PERCENT!!!!',
      ),
  );

  /**
   * Optional callback, stored by set_callback() and invoked by html() with
   * the parser instance as its only argument.
   *
   * @var callable|null
   */
  protected static $callback;

  /**
   * The wrapped DOM document.
   *
   * @var DOMDocument
   */
  protected $document;

  /**
   * Encoding used when creating and serializing the document.
   *
   * @var string
   */
  protected $encoding = 'UTF-8';

  /**
   * Set to true by createDOMDocument() when the input contained no "<" at all.
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtml = false;

  /**
   * Set to true by createDOMDocument() when the input contained no "<html".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
95
96 85
  /**
   * Create the parser, optionally seeded with content.
   *
   * A fresh DOMDocument is always created first. A SimpleHtmlDom argument is
   * unwrapped to its DOM node, a DOM node is deep-imported into the new
   * document, and any other non-null value is handed to loadHtml().
   *
   * @param string|SimpleHtmlDom|\DOMNode|null $element HTML code, a SimpleHtmlDom or a \DOMNode
   */
  public function __construct($element = null)
  {
    $this->document = new \DOMDocument('1.0', $this->getEncoding());

    // DOMDocument settings
    $this->document->preserveWhiteSpace = false;
    $this->document->formatOutput = true;

    // unwrap our own element type to the underlying DOM node
    if ($element instanceof SimpleHtmlDom) {
      $element = $element->getNode();
    }

    if (!($element instanceof \DOMNode)) {
      // anything else that was actually passed is treated as HTML
      if ($element !== null) {
        $this->loadHtml($element);
      }

      return;
    }

    // deep-copy the foreign node into this document before attaching it
    $imported = $this->document->importNode($element, true);
    if ($imported instanceof \DOMNode) {
      $this->document->appendChild($imported);
    }
  }
127
128
  /**
   * Forward calls to legacy method names to their current implementation.
   *
   * The name is lower-cased and looked up in self::$functionAliases.
   *
   * @param string $name      called method name
   * @param array  $arguments arguments passed to the call
   *
   * @return mixed whatever the aliased method returns
   *
   * @throws BadMethodCallException if no alias exists for the name
   */
  public function __call($name, $arguments)
  {
    $alias = strtolower($name);

    if (!isset(self::$functionAliases[$alias])) {
      throw new BadMethodCallException('Method does not exist: ' . $alias);
    }

    $target = array($this, self::$functionAliases[$alias]);

    return call_user_func_array($target, $arguments);
  }
144 11
145
  /**
   * Static factories kept for api-compatibility: "str_get_html" and
   * "file_get_html".
   *
   * The name is matched case-insensitively, consistent with __call() and
   * with PHP's own case-insensitive method names.
   *
   * @param string $name      called static method name
   * @param array  $arguments arguments passed to the call; the first one is
   *                          the HTML string or the file path
   *
   * @return HtmlDomParser
   *
   * @throws BadMethodCallException if the name is not a known factory
   */
  public static function __callStatic($name, $arguments)
  {
    $name = strtolower($name);

    if ($name === 'str_get_html') {
      $parser = new self();

      return $parser->loadHtml($arguments[0]);
    }

    if ($name === 'file_get_html') {
      $parser = new self();

      return $parser->loadHtmlFile($arguments[0]);
    }

    throw new BadMethodCallException('Method does not exist: ' . $name);
  }
167
168
  /**
   * Magic read access for the legacy properties.
   *
   * The name is lower-cased first: "outerhtml"/"outertext" give html(),
   * "innerhtml"/"innertext" give innerHtml(), "text"/"plaintext" give text().
   *
   * @param string $name property name
   *
   * @return string|null null for any other property name
   */
  public function __get($name)
  {
    $property = strtolower($name);

    if ($property === 'outerhtml' || $property === 'outertext') {
      return $this->html();
    }

    if ($property === 'innerhtml' || $property === 'innertext') {
      return $this->innerHtml();
    }

    if ($property === 'text' || $property === 'plaintext') {
      return $this->text();
    }

    return null;
  }
191
192
  /**
   * Calling the parser like a function is a shortcut for find().
   *
   * @param string   $selector CSS selector
   * @param int|null $idx      optional index, passed through to find()
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
   */
  public function __invoke($selector, $idx = null)
  {
    $result = $this->find($selector, $idx);

    return $result;
  }
202
203
  /**
   * String conversion yields the same output as html().
   *
   * @return string
   */
  public function __toString()
  {
    $outerHtml = $this->html();

    return $outerHtml;
  }
210
211
  /**
   * No-op that exists only for api-compatibility.
   *
   * @return bool always true
   */
  public function clear()
  {
    $cleared = true;

    return $cleared;
  }
220 45
221 45
  /**
   * Mask characters in the HTML with placeholder tokens before parsing.
   *
   * Links (http/https) are detected first; each link is replaced by a copy
   * in which the characters from self::$domLinkReplaceHelper are masked.
   * After that the characters from self::$domReplaceHelper are masked in the
   * whole string. putReplacedBackToPreserveHtmlEntities() reverses this.
   *
   * @param string $html
   *
   * @return string the masked html
   */
  private function replaceToPreserveHtmlEntities($html)
  {
    preg_match_all("/(\bhttps?:\/\/[^\s()<>]+(?:\([\w\d]+\)|[^[:punct:]\s]|\/|\}|\]))/i", $html, $matches);

    // default: only the generic characters are masked
    $search = self::$domReplaceHelper['orig'];
    $replace = self::$domReplaceHelper['tmp'];

    if (!empty($matches[1])) {
      $links = $matches[1];

      // str_replace() on an array subject masks every link and keeps the keys
      $maskedLinks = str_replace(
          self::$domLinkReplaceHelper['orig'],
          self::$domLinkReplaceHelper['tmp'],
          $links
      );

      // links go first, so they are swapped before the generic characters
      $search = array_merge($links, $search);
      $replace = array_merge($maskedLinks, $replace);
    }

    return str_replace($search, $replace, $html);
  }
253 30
254
  /**
   * Turn the placeholder tokens back into the original characters.
   *
   * Reverses both placeholder lists (link characters and generic characters)
   * and additionally removes every "&#13;" from the string.
   *
   * @param string $html
   *
   * @return string
   */
  public static function putReplacedBackToPreserveHtmlEntities($html)
  {
    $placeholders = array_merge(
        self::$domLinkReplaceHelper['tmp'],
        self::$domReplaceHelper['tmp'],
        array('&#13;')
    );

    $originals = array_merge(
        self::$domLinkReplaceHelper['orig'],
        self::$domReplaceHelper['orig'],
        array('')
    );

    return str_replace($placeholders, $originals, $html);
  }
275
276
  /**
   * Create the DOMDocument from a HTML string.
   *
   * The input is first tried as XML via simplexml_load_string(); if that
   * fails or reports libxml errors, it is parsed with DOMDocument::loadHTML()
   * after prepending an "<?xml encoding=...?>" prolog (unless the input
   * already starts with an XML declaration) and masking special characters.
   *
   * Side effects: records whether the input contained any "<" and whether it
   * contained "<html", and replaces / fills $this->document.
   *
   * @param string $html
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html)
  {
    if (strpos($html, '<') === false) {
      $this->isDOMDocumentCreatedWithoutHtml = true;
    }

    if (strpos($html, '<html') === false) {
      $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
    }

    // collect libxml errors internally instead of emitting warnings
    $internalErrors = libxml_use_internal_errors(true);

    // libxml_disable_entity_loader() is deprecated as of PHP 8.0,
    // so it is only toggled (and later restored) on older versions
    $toggleEntityLoader = PHP_VERSION_ID < 80000;
    $disableEntityLoader = false;
    if ($toggleEntityLoader) {
      $disableEntityLoader = libxml_disable_entity_loader(true);
    }

    libxml_clear_errors();

    $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;
    // defined() expects the constant's name as a string
    if (defined('LIBXML_COMPACT')) {
      $options |= LIBXML_COMPACT;
    }

    $sxe = simplexml_load_string($html, 'SimpleXMLElement', $options);
    if ($sxe !== false && count(libxml_get_errors()) === 0) {
      $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {

      // UTF-8 hack: http://php.net/manual/en/domdocument.loadhtml.php#95251
      $html = trim($html);
      $xmlHackUsed = false;
      // haystack first, needle second: only add the prolog when the input
      // does not already start with an XML declaration
      if (stripos($html, '<?xml') !== 0) {
        $xmlHackUsed = true;
        $html = '<?xml encoding="' . $this->getEncoding() . '" ?>' . $html;
      }

      $html = $this->replaceToPreserveHtmlEntities($html);

      $this->document->loadHTML($html);

      // remove the "xml-encoding" hack
      if ($xmlHackUsed === true) {
        // iterate over a snapshot: childNodes is a live list and removing
        // nodes while walking it can skip entries
        foreach (iterator_to_array($this->document->childNodes) as $child) {
          if ($child->nodeType === XML_PI_NODE) {
            $this->document->removeChild($child);
          }
        }
      }

      libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($internalErrors);
    if ($toggleEntityLoader) {
      libxml_disable_entity_loader($disableEntityLoader);
    }

    return $this->document;
  }
341
342
  /**
   * Return the first element with the given id.
   *
   * Delegates to find() with the selector "#<id>" and index 0.
   *
   * @param string $id
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementById($id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
353 1
354 1
  /**
   * Return the first element with the given tag name.
   *
   * @param string $name tag name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank a blank node if nothing matches
   */
  public function getElementByTagName($name)
  {
    $first = $this->document->getElementsByTagName($name)->item(0);

    if ($first === null) {
      return new SimpleHtmlDomNodeBlank();
    }

    return new SimpleHtmlDom($first);
  }
371
372
  /**
   * Return the elements with the given id.
   *
   * Delegates to find() with the selector "#<id>"; see find() for the
   * meaning of $idx.
   *
   * @param string   $id
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementsById($id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
384
385 1
  /**
   * Returns Elements by tag name
   *
   * @param string   $name tag name
   * @param null|int $idx  null for the whole list, otherwise the position of
   *                       a single element (negative values count from the end)
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $nodesList = $this->document->getElementsByTagName($name);

    return $this->pickFromNodeList($nodesList, $idx);
  }

  /**
   * Find list of nodes with a CSS selector.
   *
   * The selector is converted to XPath by SelectorConverter::toXPath() and
   * evaluated against the current document.
   *
   * @param string   $selector CSS selector
   * @param null|int $idx      null for the whole list, otherwise the position
   *                           of a single element (negative values count from
   *                           the end)
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank
   */
  public function find($selector, $idx = null)
  {
    $xPathQuery = SelectorConverter::toXPath($selector);

    $xPath = new DOMXPath($this->document);
    $nodesList = $xPath->query($xPathQuery);

    return $this->pickFromNodeList($nodesList, $idx);
  }

  /**
   * Wrap DOM nodes into SimpleHtmlDom objects and apply the optional index.
   *
   * Shared by find() and getElementsByTagName().
   *
   * @param \DOMNodeList|false $nodesList nodes to wrap; false (as returned by
   *                                      DOMXPath::query() on failure) is
   *                                      treated as an empty list
   * @param null|int           $idx       null for the whole list, otherwise
   *                                      the position of a single element
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|SimpleHtmlDomNodeBlank the list
   *         when $idx is null, the element at $idx when it exists, a blank
   *         node otherwise
   */
  private function pickFromNodeList($nodesList, $idx)
  {
    $elements = new SimpleHtmlDomNode();

    if ($nodesList instanceof \DOMNodeList) {
      foreach ($nodesList as $node) {
        $elements[] = new SimpleHtmlDom($node);
      }
    }

    if (null === $idx) {
      return $elements;
    }

    // negative index counts from the end of the list
    if ($idx < 0) {
      $idx = count($elements) + $idx;
    }

    if (isset($elements[$idx])) {
      return $elements[$idx];
    }

    return new SimpleHtmlDomNodeBlank();
  }
452 9
453
  /**
   * Clean up serialized output before it is handed to the caller.
   *
   * Steps, in order: strip line breaks and wrapper tags when the input had
   * no "<html"; strip paragraph tags and the HTML 4.0 doctype when the input
   * had no markup at all; decode html entities, trim, url-decode; finally put
   * the masked characters back.
   *
   * @param string $content
   *
   * @return string
   */
  protected function fixHtmlOutput($content)
  {
    // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
    //          so we try to remove it here again ...

    if ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      $wrapperParts = array(
          "\n",
          "\r\n",
          "\r",
          '<simpleHtmlDomP>',
          '</simpleHtmlDomP>',
          '<body>',
          '</body>',
          '<html>',
          '</html>',
      );

      $content = str_replace($wrapperParts, '', $content);
    }

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      $plainTextParts = array(
          '<p>',
          '</p>',
          '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">'
      );

      $content = str_replace($plainTextParts, '', $content);
    }

    $decoded = UTF8::html_entity_decode($content);
    $decoded = UTF8::urldecode(trim($decoded));

    return self::putReplacedBackToPreserveHtmlEntities($decoded);
  }
500
501
  /**
   * Get the wrapped DOM document.
   *
   * @return DOMDocument
   */
  public function getDocument()
  {
    $document = $this->document;

    return $document;
  }
508
509
  /**
   * Get the encoding that is used for the document.
   *
   * @return string
   */
  private function getEncoding()
  {
    $encoding = $this->encoding;

    return $encoding;
  }
518 19
519
  /**
   * Whether the loaded input contained no "<" at all.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml()
  {
    $withoutHtml = $this->isDOMDocumentCreatedWithoutHtml;

    return $withoutHtml;
  }
526
527
  /**
   * Whether the loaded input contained no "<html".
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
  {
    $withoutWrapper = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $withoutWrapper;
  }
534 19
535 7
  /**
   * Get dom node's outer html
   *
   * If a callback was registered via set_callback(), it is invoked with this
   * parser first. When the input had no "<html", only the document element
   * is serialized; otherwise the whole document is.
   *
   * @return string
   */
  public function html()
  {
    if (static::$callback !== null) {
      call_user_func(static::$callback, $this);
    }

    $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($content);
  }
554
555 1
  /**
   * Get the HTML as XML.
   *
   * The document is serialized with LIBXML_NOEMPTYTAG, the XML header is
   * stripped and the result is passed through fixHtmlOutput().
   *
   * @return string
   */
  public function xml()
  {
    $rawXml = $this->document->saveXML(null, LIBXML_NOEMPTYTAG);

    // remove the XML-header
    $withoutHeader = preg_replace('/<\?xml.*\?>/', '', $rawXml);

    return $this->fixHtmlOutput(ltrim($withoutHeader));
  }
569
570
  /**
   * Get dom node's inner html
   *
   * Serializes every child of the document element, each passed through
   * fixHtmlOutput(), and concatenates the results.
   *
   * @return string empty string when the document has no root element
   */
  public function innerHtml()
  {
    $text = '';

    // an empty document (e.g. nothing loaded yet) has no document element
    $root = $this->document->documentElement;
    if ($root === null) {
      return $text;
    }

    foreach ($root->childNodes as $node) {
      $text .= $this->fixHtmlOutput($this->document->saveHTML($node));
    }

    return $text;
  }
585 77
586 3
  /**
   * Load HTML from string
   *
   * @param string $html
   *
   * @return HtmlDomParser this instance
   *
   * @throws InvalidArgumentException if argument is not string
   */
  public function loadHtml($html)
  {
    if (is_string($html)) {
      $this->document = $this->createDOMDocument($html);

      return $this;
    }

    throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
  }
605
606
  /**
   * Load HTML from file
   *
   * Paths starting with "http://" or "https://" are not checked for
   * existence; every other path must exist on disk.
   *
   * @param string $filePath local path or http(s) url
   *
   * @return HtmlDomParser this instance
   *
   * @throws InvalidArgumentException if the path is not a string
   * @throws RuntimeException         if the file is missing or cannot be read
   */
  public function loadHtmlFile($filePath)
  {
    if (!is_string($filePath)) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    $isRemote = (bool)preg_match("/^https?:\/\//i", $filePath);
    if (!$isRemote && !file_exists($filePath)) {
      throw new RuntimeException("File $filePath not found");
    }

    try {
      $contents = file_get_contents($filePath);
    } catch (\Exception $e) {
      // treated exactly like a failed read below
      $contents = false;
    }

    if ($contents === false) {
      throw new RuntimeException("Could not load file $filePath");
    }

    return $this->loadHtml($contents);
  }
638
639
  /**
   * Save dom as string
   *
   * Returns the output of innerHtml(); when a non-empty path is given, the
   * same string is also written to that file (with an exclusive lock).
   *
   * @param string $filepath target file, or '' to skip writing
   *
   * @return string
   */
  public function save($filepath = '')
  {
    $string = $this->innerHtml();

    if ($filepath === '') {
      return $string;
    }

    file_put_contents($filepath, $string, LOCK_EX);

    return $string;
  }
655
656
  /**
   * Register the callback that html() invokes before serializing.
   *
   * The callback is stored in a static property.
   *
   * @param callable|null $functionName
   */
  public function set_callback($functionName)
  {
    static::$callback = $functionName;
  }
663
664
  /**
   * Get dom node's plain text
   *
   * Uses the document's textContent, passed through fixHtmlOutput().
   *
   * @return string
   */
  public function text()
  {
    $plainText = $this->document->textContent;

    return $this->fixHtmlOutput($plainText);
  }
673
}
674