Completed
Push — master ( 3a68df...3c8147 )
by Lars
03:00
created

HtmlDomParser::innerHtml()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 10
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 10
ccs 6
cts 6
cp 1
rs 9.4285
cc 2
eloc 5
nc 2
nop 0
crap 2
1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMDocument;
7
use DOMXPath;
8
use InvalidArgumentException;
9
use RuntimeException;
10
11
/**
12
 * Class HtmlDomParser
13
 *
14
 * @package voku\helper
15
 *
16
 * @property string      outertext Get dom node's outer html
17
 * @property string      innertext Get dom node's inner html
18
 * @property-read string plaintext Get dom node's plain text
19
 *
20
 * @method string outertext() Get dom node's outer html
21
 * @method string innertext() Get dom node's inner html
22
 * @method HtmlDomParser load() load($html) Load HTML from string
23
 * @method HtmlDomParser load_file() load_file($html) Load HTML from file
24
 *
25
 * @method static HtmlDomParser file_get_html() file_get_html($html) Load HTML from file
26
 * @method static HtmlDomParser str_get_html() str_get_html($html) Load HTML from string
27
 */
28
class HtmlDomParser
29
{
30
  /**
31
   * Alias method names (keys) mapped to the names of the real methods
   * (values) that __call() dispatches to.
   *
   * @var array
32
   */
33
  protected static $functionAliases = array(
34
      'outertext' => 'html',
35
      'innertext' => 'innerHtml',
36
      'load'      => 'loadHtml',
37
      'load_file' => 'loadHtmlFile',
38
  );
39
40
  /**
41
   * Optional hook that html() invokes with the parser instance before
   * serialising. Static, so it is shared by all instances; null until
   * set_callback() assigns it.
   *
   * @var callable|null
42
   */
43
  protected static $callback;
44
45
  /**
46
   * The document that queries and serialisation operate on.
   *
   * @var DOMDocument
47
   */
48
  protected $document;
49
50
  /**
51
   * Encoding used when creating the DOMDocument and in the XML-encoding hint
   * prepended before DOMDocument::loadHTML().
   *
   * @var string
52
   */
53
  protected $encoding = 'UTF-8';
54
55
  /**
56
   * Set to true by createDOMDocument() when the loaded input contains no "<"
   * character; fixHtmlOutput() then also strips <p> wrappers.
   *
   * @var bool
57
   */
58
  protected $isDOMDocumentCreatedWithoutHtml = false;
59
60
  /**
61
   * Set to true by createDOMDocument() when the loaded input contains no
   * "<html" substring; html() and fixHtmlOutput() consult it.
   *
   * @var bool
62
   */
63
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
64
65
  /**
   * Create a parser, optionally seeded with content.
   *
   * A SimpleHtmlDom argument is first unwrapped via getNode(). A \DOMNode is
   * deep-imported into a fresh document and appended to it. Any other
   * non-null value is passed on to loadHtml().
   *
   * @param string|SimpleHtmlDom|\DOMNode|null $element HTML code or SimpleHtmlDom, \DOMNode
   *
   * @throws InvalidArgumentException when a non-string value reaches loadHtml()
   */
  public function __construct($element = null)
  {
    $this->document = new \DOMDocument('1.0', $this->getEncoding());

    // Unwrap the helper object so only raw values are handled below.
    $source = ($element instanceof SimpleHtmlDom) ? $element->getNode() : $element;

    if ($source instanceof \DOMNode) {
      $imported = $this->document->importNode($source, true);

      // importNode() may not yield a node; only attach a real one.
      if ($imported instanceof \DOMNode) {
        $this->document->appendChild($imported);
      }

      return;
    }

    if ($source === null) {
      return;
    }

    $this->loadHtml($source);
  }
92
93
  /**
   * Route calls to alias method names onto the real methods.
   *
   * The alias is looked up in self::$functionAliases and the mapped method is
   * invoked on this instance with the original arguments.
   *
   * @param string $name      called method name
   * @param array  $arguments arguments of the call
   *
   * @return mixed whatever the aliased method returns
   *
   * @throws BadMethodCallException if $name is not a known alias
   */
  public function __call($name, $arguments)
  {
    if (!isset(self::$functionAliases[$name])) {
      throw new BadMethodCallException('Method does not exist: ' . $name);
    }

    $target = array($this, self::$functionAliases[$name]);

    return call_user_func_array($target, $arguments);
  }
107
108
  /**
   * Static factory aliases: "str_get_html" and "file_get_html".
   *
   * Both create a new parser and load the first argument into it, either as
   * an HTML string (loadHtml) or as a file path (loadHtmlFile).
   *
   * @param string $name      called static method name
   * @param array  $arguments arguments of the call; only index 0 is used
   *
   * @return HtmlDomParser
   *
   * @throws BadMethodCallException for any other method name
   */
  public static function __callStatic($name, $arguments)
  {
    // Loose comparison is kept on purpose so behaviour matches the original.
    if ($name == 'str_get_html') {
      $loader = 'loadHtml';
    } elseif ($name == 'file_get_html') {
      $loader = 'loadHtmlFile';
    } else {
      throw new BadMethodCallException('Method does not exist');
    }

    $parser = new self();

    return $parser->$loader($arguments[0]);
  }
130
131
  /**
   * Magic read access for the virtual properties "outertext", "innertext"
   * and "plaintext".
   *
   * @param string $name property name
   *
   * @return string|null html(), innerHtml() or text() respectively; null for
   *                     any other name
   */
  public function __get($name)
  {
    // Loose comparisons mirror the semantics of the original switch.
    if ($name == 'outertext') {
      return $this->html();
    }

    if ($name == 'innertext') {
      return $this->innerHtml();
    }

    if ($name == 'plaintext') {
      return $this->text();
    }

    return null;
  }
149
150
  /**
   * Allow the parser object to be called like a function; forwards to find().
   *
   * @param string   $selector selector passed through to find()
   * @param int|null $idx      optional index passed through to find()
   *
   * @return mixed the result of find()
   */
  public function __invoke($selector, $idx = null)
  {
    $result = $this->find($selector, $idx);

    return $result;
  }
160
161
  /**
   * String conversion yields the same value as html().
   *
   * @return string
   */
  public function __toString()
  {
    $markup = $this->html();

    return $markup;
  }
168
169
  /**
   * No-op kept only for API compatibility.
   *
   * @return bool always true
   */
  public function clear()
  {
    // Nothing to release here; callers only rely on the true return value.
    return true;
  }
178
179
  /**
   * Create the DOMDocument from an HTML string.
   *
   * The input is first parsed with simplexml_load_string(); if that succeeds
   * without libxml errors the resulting document is used. Otherwise the
   * existing document loads the input through DOMDocument::loadHTML() with a
   * temporary XML-encoding processing instruction, which is removed again.
   *
   * Side effects: recomputes both "created without html" flags, replaces or
   * refills $this->document, and temporarily switches the libxml error and
   * entity-loader settings (restored before returning).
   *
   * @param string $html
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html)
  {
    // Recompute the flags on every load so a re-used parser does not keep
    // stale values from a previous input.
    $this->isDOMDocumentCreatedWithoutHtml = (strpos($html, '<') === false);
    $this->isDOMDocumentCreatedWithoutHtmlWrapper = (strpos($html, '<html') === false);

    // set error level
    $internalErrors = libxml_use_internal_errors(true);
    // NOTE(review): libxml_disable_entity_loader() is deprecated as of PHP 8.0;
    // kept here because removing it would change entity-loading behaviour.
    $disableEntityLoader = libxml_disable_entity_loader(true);
    libxml_clear_errors();

    $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;
    // defined() expects the constant *name*; passing the constant itself
    // never matched, so LIBXML_COMPACT was silently never applied.
    if (defined('LIBXML_COMPACT')) {
      $options |= LIBXML_COMPACT;
    }

    $sxe = simplexml_load_string($html, 'SimpleXMLElement', $options);

    // A failed parse can return false without recording an error, so the
    // return value has to be checked too before importing it.
    if ($sxe !== false && count(libxml_get_errors()) === 0) {
      $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {
      $this->document->loadHTML('<?xml encoding="' . $this->getEncoding() . '">' . $html);

      // remove the "xml-encoding" hack; collect first, because childNodes is
      // a live list and removing while iterating can skip nodes
      $processingInstructions = array();
      foreach ($this->document->childNodes as $child) {
        if ($child->nodeType == XML_PI_NODE) {
          $processingInstructions[] = $child;
        }
      }

      foreach ($processingInstructions as $processingInstruction) {
        $this->document->removeChild($processingInstruction);
      }

      libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($internalErrors);
    libxml_disable_entity_loader($disableEntityLoader);

    return $this->document;
  }
231
232
  /**
   * Callback for preg_replace_callback(): converts the matched HTML entity
   * to its UTF-8 representation.
   *
   * @param array $matches PREG matches; only the full match (index 0) is used
   *
   * @return string
   */
  protected function entityCallback(&$matches)
  {
    $entity = $matches[0];

    return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
  }
243
244
  /**
   * Return the first element matching the given id.
   *
   * Delegates to find() with the selector "#<id>" and index 0.
   *
   * @param string $id
   *
   * @return mixed the result of find() for index 0
   */
  public function getElementById($id)
  {
    return $this->find('#' . $id, 0);
  }
255
256
  /**
   * Return the first element with the given tag name.
   *
   * @param string $name tag name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank wrapper around the first
   *         matching node, or a blank node object when there is no match
   */
  public function getElementByTagName($name)
  {
    $firstMatch = $this->document->getElementsByTagName($name)->item(0);

    if ($firstMatch === null) {
      return new SimpleHtmlDomNodeBlank();
    }

    return new SimpleHtmlDom($firstMatch);
  }
273
274
  /**
   * Return the elements matching the given id.
   *
   * Delegates to find() with the selector "#<id>".
   *
   * @param string   $id
   * @param null|int $idx optional index passed through to find()
   *
   * @return mixed the result of find()
   */
  public function getElementsById($id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
286
287
  /**
   * Return the elements with the given tag name.
   *
   * Without an index the whole collection is returned. With an index the
   * single entry at that position is returned; a negative index counts from
   * the end of the collection.
   *
   * @param string   $name tag name
   * @param null|int $idx  optional position within the matches
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDom|SimpleHtmlDomNodeBlank the
   *         collection, one entry of it, or a blank node object when the
   *         requested position does not exist
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $collection = new SimpleHtmlDomNode();

    foreach ($this->document->getElementsByTagName($name) as $domNode) {
      $collection[] = new SimpleHtmlDom($domNode);
    }

    if ($idx === null) {
      return $collection;
    }

    // Negative positions are resolved relative to the end of the collection.
    $position = ($idx < 0) ? count($collection) + $idx : $idx;

    return isset($collection[$position]) ? $collection[$position] : new SimpleHtmlDomNodeBlank();
  }
319
320
  /**
   * Find list of nodes with a CSS selector.
   *
   * The selector is converted to XPath via SelectorConverter::toXPath() and
   * evaluated against the current document. Without an index the whole
   * collection is returned. With an index the single entry at that position
   * is returned; a negative index counts from the end of the collection.
   *
   * @param string   $selector
   * @param int|null $idx optional position within the matches
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDom|SimpleHtmlDomNodeBlank the
   *         collection of matches, one entry of it, or a blank node object
   *         when the requested position does not exist
   */
  public function find($selector, $idx = null)
  {
    $xPathQuery = SelectorConverter::toXPath($selector);

    $xPath = new DOMXPath($this->document);
    $nodesList = $xPath->query($xPathQuery);
    $elements = new SimpleHtmlDomNode();

    // DOMXPath::query() returns false for a malformed expression; treat that
    // as "no matches" instead of iterating over a boolean.
    if ($nodesList !== false) {
      foreach ($nodesList as $node) {
        $elements[] = new SimpleHtmlDom($node);
      }
    }

    if (null === $idx) {
      return $elements;
    }

    // Negative positions are resolved relative to the end of the collection.
    if ($idx < 0) {
      $idx = count($elements) + $idx;
    }

    if (isset($elements[$idx])) {
      return $elements[$idx];
    }

    return new SimpleHtmlDomNodeBlank();
  }
354
355
  /**
   * Clean up serialised markup before it is handed back to the caller.
   *
   * Depending on how the document was created, wrapper tags added during
   * parsing (<html>, <body>, <simpleHtmlDomP>, and for plain-text input also
   * <p>) and line breaks are stripped. Afterwards numeric entities matched by
   * the pattern are converted through entityCallback(), and the result is
   * trimmed and url-decoded.
   *
   * @param string $content
   *
   * @return string
   */
  protected function fixHtmlOutput($content)
  {
    $withoutHtml = ($this->isDOMDocumentCreatedWithoutHtml === true);
    $withoutWrapper = ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true);

    if ($withoutHtml || $withoutWrapper) {
      // The order of the needles is significant: str_replace() applies them
      // one after another.
      $needles = array("\n");

      // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
      //          so we try to remove it here again ...
      if ($withoutHtml) {
        $needles[] = '<p>';
        $needles[] = '</p>';
      }

      $needles[] = "\n" . '<simpleHtmlDomP>';
      $needles[] = '<simpleHtmlDomP>';
      $needles[] = '</simpleHtmlDomP>';
      $needles[] = '<body>';
      $needles[] = '</body>';
      $needles[] = '<html>';
      $needles[] = '</html>';

      $content = str_replace($needles, '', $content);
    }

    // replace html entities which represent UTF-8 codepoints.
    $decoded = preg_replace_callback("/&#\d{2,5};/", array($this, 'entityCallback'), $content);

    return urldecode(trim($decoded));
  }
395
396
  /**
   * Expose the underlying DOM document.
   *
   * @return DOMDocument the document held by this parser
   */
  public function getDocument()
  {
    return $this->document;
  }
403
404
  /**
   * Return the encoding configured for this parser.
   *
   * @return string value of the $encoding property
   */
  private function getEncoding()
  {
    return $this->encoding;
  }
413
414
  /**
   * Report the "created without html" flag maintained by createDOMDocument().
   *
   * @return bool current value of $isDOMDocumentCreatedWithoutHtml
   */
  public function getIsDOMDocumentCreatedWithoutHtml()
  {
    return $this->isDOMDocumentCreatedWithoutHtml;
  }
421
422
  /**
   * Report the "created without html wrapper" flag maintained by
   * createDOMDocument().
   *
   * @return bool current value of $isDOMDocumentCreatedWithoutHtmlWrapper
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
  {
    return $this->isDOMDocumentCreatedWithoutHtmlWrapper;
  }
429
430
  /**
   * Get dom node's outer html.
   *
   * If a callback has been registered it is invoked with this parser first.
   * When the input had no "<html" wrapper only the document element is
   * serialised, otherwise the whole document; the result is passed through
   * fixHtmlOutput().
   *
   * @return string
   */
  public function html()
  {
    $hook = static::$callback;
    if ($hook !== null) {
      call_user_func_array($hook, array($this));
    }

    $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($content);
  }
449
450
  /**
   * Get dom node's inner html.
   *
   * Serialises every child of the document element, passes each piece
   * through fixHtmlOutput() and concatenates the results.
   *
   * @return string the concatenated markup; an empty string when the
   *                document has no document element (nothing loaded yet)
   */
  public function innerHtml()
  {
    $root = $this->document->documentElement;

    // An empty document has no root element; dereferencing it would raise
    // errors, so report "no content" directly.
    if ($root === null) {
      return '';
    }

    $text = '';

    foreach ($root->childNodes as $node) {
      $text .= $this->fixHtmlOutput($this->document->saveHTML($node));
    }

    return $text;
  }
465
466
  /**
   * Load HTML from string.
   *
   * Replaces the current document with one created from $html.
   *
   * @param string $html
   *
   * @return HtmlDomParser this instance, for chaining
   *
   * @throws InvalidArgumentException if argument is not string
   */
  public function loadHtml($html)
  {
    if (is_string($html)) {
      $this->document = $this->createDOMDocument($html);

      return $this;
    }

    throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
  }
485
486
  /**
   * Load HTML from file.
   *
   * Paths starting with "http://" or "https://" skip the local existence
   * check; everything else must exist on disk. The content is read with
   * file_get_contents() and handed to loadHtml().
   *
   * @param string $filePath local path or http(s) URL
   *
   * @return HtmlDomParser this instance, for chaining
   *
   * @throws InvalidArgumentException if $filePath is not a string
   * @throws RuntimeException if the file does not exist or cannot be read
   */
  public function loadHtmlFile($filePath)
  {
    if (!is_string($filePath)) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    if (!preg_match("/^https?:\/\//i", $filePath) && !file_exists($filePath)) {
      throw new RuntimeException("File $filePath not found");
    }

    try {
      $html = file_get_contents($filePath);
    } catch (\Exception $e) {
      // Presumably only reachable when an error handler converts warnings to
      // exceptions; keep the original exception as the cause for debugging.
      throw new RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html);

    return $this;
  }
517
518
  /**
   * Return the inner html and optionally write it to a file.
   *
   * @param string $filepath target file; when empty (the default) nothing is
   *                         written
   *
   * @return string the value of innerHtml()
   */
  public function save($filepath = '')
  {
    $markup = $this->innerHtml();

    if ($filepath === '') {
      return $markup;
    }

    // Written with an exclusive lock; the write result is not inspected.
    file_put_contents($filepath, $markup, LOCK_EX);

    return $markup;
  }
534
535
  /**
   * Register the callback that html() invokes before serialising.
   *
   * The callback is stored in a static property, so it applies to every
   * parser instance, not only this one.
   *
   * @param callable|null $functionName callback receiving the parser instance
   */
  public function set_callback($functionName)
  {
    static::$callback = $functionName;
  }
542
543
  /**
   * Get dom node's plain text.
   *
   * @return string the textContent of the underlying document
   */
  public function text()
  {
    $document = $this->document;

    return $document->textContent;
  }
552
}
553