Completed
Push — master ( 2483d3...f8ad5b )
by Lars
02:46
created

HtmlDomParser::createDOMDocument()   C

Complexity

Conditions 7
Paths 16

Size

Total Lines 45
Code Lines 24

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 27
CRAP Score 7.016

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 7
eloc 24
c 2
b 0
f 0
nc 16
nop 1
dl 0
loc 45
rs 6.7272
ccs 27
cts 29
cp 0.931
crap 7.016
1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMDocument;
7
use DOMXPath;
8
use InvalidArgumentException;
9
use RuntimeException;
10
11
/**
12
 * Class HtmlDomParser
13
 *
14
 * @package voku\helper
15
 *
16
 * @property string      outertext Get dom node's outer html
17
 * @property string      innertext Get dom node's inner html
18
 * @property-read string plaintext Get dom node's plain text
19
 *
20
 * @method string outertext() Get dom node's outer html
21
 * @method string innertext() Get dom node's inner html
22
 * @method HtmlDomParser load() load($html) Load HTML from string
23
 * @method HtmlDomParser load_file() load_file($html) Load HTML from file
24
 *
25
 * @method static HtmlDomParser file_get_html() file_get_html($html) Load HTML from file
26
 * @method static HtmlDomParser str_get_html() str_get_html($html) Load HTML from string
27
 */
28
class HtmlDomParser
29
{
30
  /**
31
   * Maps legacy method names to the methods that __call() dispatches to.
   *
   * @var array
32
   */
33
  protected static $functionAliases = array(
34
      'outertext' => 'html',
35
      'innertext' => 'innerHtml',
36
      'load'      => 'loadHtml',
37
      'load_file' => 'loadHtmlFile',
38
  );
39
40
  /**
41
   * Optional hook stored by set_callback(); html() invokes it with the
   * parser instance before serialising. Static, so shared by all instances.
   *
   * @var callable|null
42
   */
43
  protected static $callback;
44
45
  /**
46
   * The document every query and serialisation operates on.
   *
   * @var DOMDocument
47
   */
48
  protected $document;
49
50
  /**
51
   * Encoding returned by getEncoding(); used when the document is created
   * and when markup is loaded.
   *
   * @var string
52
   */
53
  protected $encoding = 'UTF-8';
54
55
  /**
56
   * Raised by createDOMDocument() when the input contained no "<" at all;
   * fixHtmlOutput() then strips the wrapper tags from the output.
   *
   * @var bool
57
   */
58
  protected $isDOMDocumentCreatedWithoutHtml = false;
59
60
  /**
61
   * Raised by createDOMDocument() when the input contained no "<html";
   * consulted by html() and fixHtmlOutput().
   *
   * @var bool
62
   */
63
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
64
65
  /**
   * Build a parser, optionally seeded with markup or an existing node.
   *
   * A SimpleHtmlDom wrapper is first unwrapped via getNode(). A \DOMNode is
   * deep-imported into a fresh document; any other non-null value is passed
   * to loadHtml().
   *
   * @param string|SimpleHtmlDom|\DOMNode $element HTML code or SimpleHtmlDom, \DOMNode
   */
  public function __construct($element = null)
  {
    $this->document = new \DOMDocument('1.0', $this->getEncoding());

    $source = ($element instanceof SimpleHtmlDom) ? $element->getNode() : $element;

    if ($source instanceof \DOMNode) {
      $imported = $this->document->importNode($source, true);

      if ($imported instanceof \DOMNode) {
        $this->document->appendChild($imported);
      }
    } elseif ($source !== null) {
      $this->loadHtml($source);
    }
  }
92
93
  /**
94
   * Forward a call made under a legacy alias to the real method.
   *
   * The name is looked up in self::$functionAliases and the mapped method is
   * invoked on this instance with the original arguments.
   *
   * @param string $name      method name as it was called
95
   * @param array  $arguments arguments passed to the call
96
   *
97
   * @return bool|mixed whatever the aliased method returns
   *
   * @throws BadMethodCallException if $name is not a known alias
98
   */
99 6 View Code Duplication
  public function __call($name, $arguments)
0 ignored issues
show
Duplication introduced by
This method seems to be duplicated in your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
100
  {
101 6
    // only names listed in the alias map are dispatched
    if (isset(self::$functionAliases[$name])) {
102 5
      return call_user_func_array(array($this, self::$functionAliases[$name]), $arguments);
103
    }
104
105 1
    throw new BadMethodCallException('Method does not exist: ' . $name);
106
  }
107
108
  /**
   * Static entry points kept for the legacy simple_html_dom API.
   *
   * 'str_get_html' parses the first argument as an HTML string and
   * 'file_get_html' loads it as a file path, each on a fresh parser.
   *
   * @param string $name      static method name as it was called
   * @param array  $arguments arguments passed to the call
   *
   * @return HtmlDomParser
   *
   * @throws BadMethodCallException for any other method name
   */
  public static function __callStatic($name, $arguments)
  {
    switch ($name) {
      case 'str_get_html':
        $instance = new self();

        return $instance->loadHtml($arguments[0]);

      case 'file_get_html':
        $instance = new self();

        return $instance->loadHtmlFile($arguments[0]);
    }

    throw new BadMethodCallException('Method does not exist');
  }
130
131
  /**
132
   * Expose the legacy read-only properties 'outertext', 'innertext' and
   * 'plaintext' by delegating to html(), innerHtml() and text().
   *
   * @param string $name property name being read
133
   *
134
   * @return string|null the serialised content, or null for an unknown name
135
   */
136 9 View Code Duplication
  public function __get($name)
0 ignored issues
show
Duplication introduced by
This method seems to be duplicated in your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
137
  {
138
    switch ($name) {
139 9
      case 'outertext':
140 6
        return $this->html();
141 3
      case 'innertext':
142 1
        return $this->innerHtml();
143 2
      case 'plaintext':
144 1
        return $this->text();
145
    }
146
147 1
    // unknown property names yield null instead of raising an error
    return null;
148
  }
149
150
  /**
   * Calling the parser like a function is shorthand for find().
   *
   * @param string   $selector CSS selector
   * @param int|null $idx      element index, or null for the full list
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function __invoke($selector, $idx = null)
  {
    $result = $this->find($selector, $idx);

    return $result;
  }
160
161
  /**
   * String conversion yields the same markup as html().
   *
   * @return string
   */
  public function __toString()
  {
    $markup = $this->html();

    return $markup;
  }
168
169
  /**
   * No-op kept only for API compatibility.
   *
   * @return bool always true
   */
  public function clear()
  {
    $cleared = true;

    return $cleared;
  }
178
179
  /**
   * Create the DOMDocument for the given markup.
   *
   * The input is first parsed as XML through SimpleXML. If that yields an
   * element and libxml recorded no errors, its owner document is adopted.
   * Otherwise the markup is loaded with DOMDocument::loadHTML(), prefixed
   * with an "xml encoding" processing instruction carrying the configured
   * encoding; that instruction is removed again afterwards.
   *
   * Side effects: replaces $this->document and raises the "without html" /
   * "without html wrapper" flags when the input contains no "<" / no
   * "<html". The flags are only ever raised here, never cleared.
   * The libxml error and entity-loader settings are restored before return.
   *
   * @param string $html
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html)
  {
    if (strpos($html, '<') === false) {
      $this->isDOMDocumentCreatedWithoutHtml = true;
    }

    if (strpos($html, '<html') === false) {
      $this->isDOMDocumentCreatedWithoutHtmlWrapper = true;
    }

    // collect libxml errors internally and disable the entity loader while parsing
    $internalErrors = libxml_use_internal_errors(true);
    $disableEntityLoader = libxml_disable_entity_loader(true);
    libxml_clear_errors();

    $options = LIBXML_DTDLOAD | LIBXML_DTDATTR | LIBXML_NONET;
    // defined() expects the constant's name as a string; passing the bare
    // constant tested for a constant named after its integer value, so this
    // option was never applied.
    // NOTE(review): libxml2 describes its compact-text-node option as unsafe
    // for trees that are modified afterwards - confirm this is acceptable here.
    if (defined('LIBXML_COMPACT')) {
      $options |= LIBXML_COMPACT;
    }

    $sxe = simplexml_load_string($html, 'SimpleXMLElement', $options);
    // also require a usable element: a false result must never reach dom_import_simplexml()
    if ($sxe !== false && count(libxml_get_errors()) === 0) {
      $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {
      $this->document->loadHTML('<?xml encoding="' . $this->getEncoding() . '">' . $html);

      // remove the "xml-encoding" hack; collect the nodes first, because
      // removing children while iterating the live node list can skip siblings
      $instructions = array();
      foreach ($this->document->childNodes as $child) {
        if ($child->nodeType === XML_PI_NODE) {
          $instructions[] = $child;
        }
      }
      foreach ($instructions as $instruction) {
        $this->document->removeChild($instruction);
      }

      libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($internalErrors);
    libxml_disable_entity_loader($disableEntityLoader);

    return $this->document;
  }
231
232
  /**
   * preg_replace_callback() handler: converts the matched HTML entity
   * (the full match) to its UTF-8 representation.
   *
   * @param  array $matches PREG matches
   *
   * @return string
   */
  protected function entityCallback(&$matches)
  {
    $entity = $matches[0];

    return mb_convert_encoding($entity, 'UTF-8', 'HTML-ENTITIES');
  }
243
244
  /**
   * Return the first element matching the given id.
   *
   * Shorthand for find() with an "#id" selector and index 0.
   *
   * @param string $id
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementById($id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
255
256
  /**
   * Return the first element with the given tag name.
   *
   * Shorthand for find() with the tag name as selector and index 0.
   *
   * @param string $name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementByTagName($name)
  {
    $first = $this->find($name, 0);

    return $first;
  }
267
268
  /**
   * Return the elements matching the given id.
   *
   * Shorthand for find() with an "#id" selector; $idx is passed through.
   *
   * @param string   $id
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementsById($id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
280
281
  /**
   * Return the elements with the given tag name.
   *
   * Shorthand for find() with the tag name as selector; $idx is passed through.
   *
   * @param string   $name
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDom|SimpleHtmlDomNodeBlank
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $matches = $this->find($name, $idx);

    return $matches;
  }
293
294
  /**
295
   * Find list of nodes with a CSS selector.
   *
   * The selector is converted to XPath and run against the current document;
   * every match is wrapped in a SimpleHtmlDom and collected in a
   * SimpleHtmlDomNode list.
296
   *
297
   * @param string   $selector CSS selector
298
   * @param int|null $idx      null for the whole list; otherwise the position
   *                           of a single element (negative counts from the end)
299
   *
300
   * @return SimpleHtmlDomNode|SimpleHtmlDom|SimpleHtmlDomNodeBlank the list when
   *         $idx is null, the element at $idx, or a blank node if none exists there
301
   */
302 54
  public function find($selector, $idx = null)
303
  {
304 54
    $xPathQuery = SelectorConverter::toXPath($selector);
305
306 54
    $xPath = new DOMXPath($this->document);
307 54
    $nodesList = $xPath->query($xPathQuery);
308 54
    $elements = new SimpleHtmlDomNode();
309
310 54
    foreach ($nodesList as $node) {
311 52
      $elements[] = new SimpleHtmlDom($node);
312 54
    }
313
314 54 View Code Duplication
    if (null === $idx) {
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
315 45
      return $elements;
0 ignored issues
show
Bug Best Practice introduced by
The return type of return $elements; (voku\helper\SimpleHtmlDomNode) is incompatible with the return type documented by voku\helper\HtmlDomParser::find of type voku\helper\SimpleHtmlDo...\helper\SimpleHtmlDom[].

If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type f.e. an interface, or abstract method. This is more formally defined by the Lizkov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangably. This principle also belongs to the SOLID principles for object oriented design.

Let’s take a look at an example:

class Author {
    private $name;

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }
}

abstract class Post {
    public function getAuthor() {
        return 'Johannes';
    }
}

class BlogPost extends Post {
    public function getAuthor() {
        return new Author('Johannes');
    }
}

class ForumPost extends Post { /* ... */ }

function my_function(Post $post) {
    echo strtoupper($post->getAuthor());
}

Our function my_function expects a Post object, and outputs the author of the post. The base class Post returns a simple string and outputting a simple string will work just fine. However, the child class BlogPost which is a sub-type of Post instead decided to return an object, and is therefore violating the SOLID principles. If a BlogPost were passed to my_function, PHP would not complain, but ultimately fail when executing the strtoupper call in its body.

Loading history...
316
    } else {
317 21
      // a negative index counts backwards from the end of the result list
      if ($idx < 0) {
318 11
        $idx = count($elements) + $idx;
319 11
      }
320
    }
321
322 21
    if (isset($elements[$idx])) {
323 21
      return $elements[$idx];
324
    } else {
325
      // nothing at that position: hand back a blank node instead of null
      return new SimpleHtmlDomNodeBlank();
0 ignored issues
show
Bug Best Practice introduced by
The return type of return new \voku\helper\SimpleHtmlDomNodeBlank(); (voku\helper\SimpleHtmlDomNodeBlank) is incompatible with the return type documented by voku\helper\HtmlDomParser::find of type voku\helper\SimpleHtmlDo...\helper\SimpleHtmlDom[].

If you return a value from a function or method, it should be a sub-type of the type that is given by the parent type f.e. an interface, or abstract method. This is more formally defined by the Lizkov substitution principle, and guarantees that classes that depend on the parent type can use any instance of a child type interchangably. This principle also belongs to the SOLID principles for object oriented design.

Let’s take a look at an example:

class Author {
    private $name;

    public function __construct($name) {
        $this->name = $name;
    }

    public function getName() {
        return $this->name;
    }
}

abstract class Post {
    public function getAuthor() {
        return 'Johannes';
    }
}

class BlogPost extends Post {
    public function getAuthor() {
        return new Author('Johannes');
    }
}

class ForumPost extends Post { /* ... */ }

function my_function(Post $post) {
    echo strtoupper($post->getAuthor());
}

Our function my_function expects a Post object, and outputs the author of the post. The base class Post returns a simple string and outputting a simple string will work just fine. However, the child class BlogPost which is a sub-type of Post instead decided to return an object, and is therefore violating the SOLID principles. If a BlogPost were passed to my_function, PHP would not complain, but ultimately fail when executing the strtoupper call in its body.

Loading history...
326
    }
327
  }
328
329
  /**
   * Post-process serialised markup before it is returned to the caller.
   *
   * Depending on how the document was created, wrapper tags and newlines
   * are stripped; numeric entities are then converted through
   * entityCallback(), and the result is trimmed and url-decoded.
   *
   * @param string $content
   *
   * @return string
   */
  protected function fixHtmlOutput($content)
  {
    $needles = array();

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
      //          so the paragraph tags are removed here as well
      $needles = array(
          "\n",
          '<p>', '</p>',
          "\n" . '<simpleHtmlDomP>', '<simpleHtmlDomP>', '</simpleHtmlDomP>',
          '<body>', '</body>',
          '<html>', '</html>'
      );
    } elseif ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      $needles = array(
          "\n",
          "\n" . '<simpleHtmlDomP>', '<simpleHtmlDomP>', '</simpleHtmlDomP>',
          '<body>', '</body>',
          '<html>', '</html>'
      );
    }

    if ($needles !== array()) {
      $content = str_replace($needles, '', $content);
    }

    // replace html entities which represent UTF-8 codepoints.
    $decoded = preg_replace_callback("/&#\d{2,5};/", array($this, 'entityCallback'), $content);

    return urldecode(trim($decoded));
  }
369
370
  /**
   * Access the underlying DOMDocument.
   *
   * @return DOMDocument
   */
  public function getDocument()
  {
    $dom = $this->document;

    return $dom;
  }
377
378
  /**
   * Encoding used when creating and loading the document.
   *
   * @return string
   */
  private function getEncoding()
  {
    $charset = $this->encoding;

    return $charset;
  }
387
388
  /**
   * Whether markup without any "<" character has been loaded.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtml;

    return $flag;
  }
395
396
  /**
   * Whether markup without an "<html" tag has been loaded.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $flag;
  }
403
404
  /**
   * Get dom node's outer html
   *
   * A registered callback is invoked with this parser first. When the
   * source had no "<html" wrapper only the document element is serialised,
   * otherwise the whole document. The result goes through fixHtmlOutput().
   *
   * @return string
   */
  public function html()
  {
    if ($this::$callback !== null) {
      call_user_func_array($this::$callback, array($this));
    }

    $markup = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($markup);
  }
423
424
  /**
   * Get dom node's inner html
   *
   * Serialises each child of the document element, passes every piece
   * through fixHtmlOutput() and concatenates the results.
   *
   * @return string the inner markup, or an empty string when the document
   *                has no document element (e.g. nothing was loaded yet)
   */
  public function innerHtml()
  {
    $root = $this->document->documentElement;

    // an empty document has no root element; there is nothing to serialise
    if ($root === null) {
      return '';
    }

    $text = '';

    foreach ($root->childNodes as $node) {
      $text .= $this->fixHtmlOutput($this->document->saveHTML($node));
    }

    return $text;
  }
439
440
  /**
   * Load HTML from string
   *
   * Replaces the current document with one built from the given markup.
   *
   * @param string $html
   *
   * @return HtmlDomParser this instance
   *
   * @throws InvalidArgumentException if argument is not string
   */
  public function loadHtml($html)
  {
    if (is_string($html)) {
      $this->document = $this->createDOMDocument($html);

      return $this;
    }

    throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
  }
459
460
  /**
   * Load HTML from file
   *
   * Paths starting with "http://" or "https://" are read without an
   * existence check; any other path must exist on disk.
   *
   * @param string $filePath local path or http(s) URL
   *
   * @return HtmlDomParser this instance
   *
   * @throws InvalidArgumentException if argument is not string
   * @throws RuntimeException         if the file is missing or cannot be read
   */
  public function loadHtmlFile($filePath)
  {
    if (!is_string($filePath)) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    if (!preg_match("/^https?:\/\//i", $filePath) && !file_exists($filePath)) {
      throw new RuntimeException("File $filePath not found");
    }

    try {
      $html = file_get_contents($filePath);
    } catch (\Exception $e) {
      // keep the original failure attached so the cause is not lost
      throw new RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html);

    return $this;
  }
491
492
  /**
   * Save dom as string
   *
   * Serialises via innerHtml(); when a non-empty path is given the same
   * string is also written to that file under an exclusive lock.
   *
   * @param string $filepath target file, or '' to skip writing
   *
   * @return string
   */
  public function save($filepath = '')
  {
    $markup = $this->innerHtml();

    if ('' !== $filepath) {
      file_put_contents($filepath, $markup, LOCK_EX);
    }

    return $markup;
  }
508
509
  /**
   * Register the callback that html() invokes before serialising.
   *
   * The callback is stored in a static property.
   *
   * @param callable $functionName
   */
  public function set_callback($functionName)
  {
    $hook = $functionName;

    $this::$callback = $hook;
  }
516
517
  /**
   * Get dom node's plain text
   *
   * Returns the textContent of the document.
   *
   * @return string
   */
  public function text()
  {
    $plain = $this->document->textContent;

    return $plain;
  }
526
}
527