Completed
Push — master ( 05ef16...3f1b18 )
by Lars
02:53
created

HtmlDomParser::clear()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
c 1
b 0
f 0
dl 0
loc 4
ccs 2
cts 2
cp 1
rs 10
cc 1
eloc 2
nc 1
nop 0
crap 1
1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMDocument;
7
use DOMXPath;
8
use InvalidArgumentException;
9
use RuntimeException;
10
11
/**
12
 * Class HtmlDomParser
13
 *
14
 * @package FastSimpleHTMLDom
15
 *
16
 * @property string      outertext Get dom node's outer html
17
 * @property string      innertext Get dom node's inner html
18
 * @property-read string plaintext Get dom node's plain text
19
 *
20
 * @method string outertext() Get dom node's outer html
21
 * @method string innertext() Get dom node's inner html
22
 * @method HtmlDomParser load() load($html) Load HTML from string
23
 * @method HtmlDomParser load_file() load_file($html) Load HTML from file
24
 *
25
 * @method static HtmlDomParser file_get_html() file_get_html($html) Load HTML from file
26
 * @method static HtmlDomParser str_get_html() str_get_html($html) Load HTML from string
27
 */
28
class HtmlDomParser
29
{
30
  /**
   * Alias method names mapped to the real method they forward to.
   *
   * Looked up by __call() so that e.g. load() ends up in loadHtml().
   *
   * @var array
   */
  protected static $functionAliases = array(
      'outertext' => 'html',
      'innertext' => 'innerHtml',
      'load'      => 'loadHtml',
      'load_file' => 'loadHtmlFile',
  );

  /**
   * Optional callable stored by set_callback() and invoked by html()
   * before the document is serialized. Static, so shared between instances.
   *
   * @var callable|null
   */
  protected static $callback;

  /**
   * The DOM document this parser wraps.
   *
   * @var DOMDocument
   */
  protected $document;

  /**
   * Encoding handed to the DOMDocument constructor and assigned to the
   * document after parsing.
   *
   * @var string
   */
  protected $encoding = 'UTF-8';

  /**
   * Set to true by createDOMDocument() when the input contains no "<" at all.
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtml = false;

  /**
   * Set to true by createDOMDocument() when the input contains no "<html".
   *
   * @var bool
   */
  protected $isDOMDocumentCreatedWithoutHtmlWrapper = false;
64
65
  /**
   * Create a parser, optionally seeded with markup or an existing node.
   *
   * A SimpleHtmlDom is first unwrapped to its node. A \DOMNode is deep-imported
   * into a fresh document; any other non-null value is passed to loadHtml().
   *
   * @param string|SimpleHtmlDom|\DOMNode $element HTML code or SimpleHtmlDom, \DOMNode
   */
  public function __construct($element = null)
  {
    $this->document = new DOMDocument('1.0', $this->getEncoding());

    $source = ($element instanceof SimpleHtmlDom) ? $element->getNode() : $element;

    if ($source instanceof \DOMNode) {
      // deep copy, so the node belongs to this parser's own document
      $imported = $this->document->importNode($source, true);

      if ($imported instanceof \DOMNode) {
        $this->document->appendChild($imported);
      }
    } elseif ($source !== null) {
      $this->loadHtml($source);
    }
  }
92
93
  /**
   * Forward calls to alias method names (see self::$functionAliases) to the
   * real method on this instance.
   *
   * @param string $name      called method name
   * @param array  $arguments arguments passed to the call
   *
   * @return bool|mixed whatever the aliased method returns
   *
   * @throws BadMethodCallException if $name is not a known alias
   */
  public function __call($name, $arguments)
  {
    if (!isset(self::$functionAliases[$name])) {
      throw new BadMethodCallException('Method does not exist: ' . $name);
    }

    $target = array($this, self::$functionAliases[$name]);

    return call_user_func_array($target, $arguments);
  }
107
108
  /**
   * Static factory shortcuts: str_get_html($html) and file_get_html($path).
   *
   * Each creates a new HtmlDomParser and returns the result of loadHtml()
   * or loadHtmlFile() respectively, called with the first argument.
   *
   * @param string $name      called static method name
   * @param array  $arguments arguments passed to the call; only the first is used
   *
   * @return HtmlDomParser
   *
   * @throws BadMethodCallException if $name is not one of the supported shortcuts
   */
  public static function __callStatic($name, $arguments)
  {
    $loaders = array(
        'str_get_html'  => 'loadHtml',
        'file_get_html' => 'loadHtmlFile',
    );

    if (!isset($loaders[$name])) {
      // include the name, consistent with the message thrown by __call()
      throw new BadMethodCallException('Method does not exist: ' . $name);
    }

    // A missing argument becomes null, so the loader rejects it with its own
    // InvalidArgumentException instead of an "undefined offset" notice first.
    $input = isset($arguments[0]) ? $arguments[0] : null;

    $parser = new HtmlDomParser();
    $loader = $loaders[$name];

    return $parser->$loader($input);
  }
130
131
  /**
   * Magic read access for the virtual properties outertext, innertext and
   * plaintext.
   *
   * @param string $name property name
   *
   * @return string|null the requested representation, or null for any other name
   */
  public function __get($name)
  {
    switch ($name) {
      case 'outertext':
        $value = $this->html();
        break;
      case 'innertext':
        $value = $this->innerHtml();
        break;
      case 'plaintext':
        $value = $this->text();
        break;
      default:
        $value = null;
    }

    return $value;
  }
149
150
  /**
   * Calling the parser like a function is a shortcut for find().
   *
   * @param string $selector
   * @param int    $idx
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|null
   */
  public function __invoke($selector, $idx = null)
  {
    $found = $this->find($selector, $idx);

    return $found;
  }
160
161
  /**
   * String conversion yields the same result as html().
   *
   * @return string
   */
  public function __toString()
  {
    $markup = $this->html();

    return $markup;
  }
168
169
  /**
   * No-op that exists only for API compatibility.
   *
   * @return bool always true
   */
  public function clear()
  {
    // nothing to release here; the method is kept so existing callers still work
    return true;
  }
178
179
  /**
   * Create the DOMDocument from an HTML string.
   *
   * The input is first tried as XML via simplexml_load_string(); if that fails
   * or reports any libxml error, it is parsed with DOMDocument::loadHTML()
   * instead, prefixed with an XML encoding declaration that is removed again
   * afterwards. The libxml settings changed here are restored before returning.
   *
   * @param string $html
   *
   * @return \DOMDocument
   */
  private function createDOMDocument($html)
  {
    // Recompute both flags on every load, so a reused parser does not keep
    // the flags of a previously loaded input.
    $this->isDOMDocumentCreatedWithoutHtml = (strpos($html, '<') === false);
    $this->isDOMDocumentCreatedWithoutHtmlWrapper = (strpos($html, '<html') === false);

    // set error level: collect libxml errors instead of emitting warnings
    $internalErrors = libxml_use_internal_errors(true);

    // libxml_disable_entity_loader() is deprecated as of PHP 8.0, so it is
    // only toggled on older versions.
    $canToggleEntityLoader = \PHP_VERSION_ID < 80000;
    $disableEntityLoader = null;
    if ($canToggleEntityLoader) {
      $disableEntityLoader = libxml_disable_entity_loader(true);
    }

    $sxe = simplexml_load_string($html);

    // simplexml_load_string() can return false without recording a libxml
    // error (e.g. for an empty string), so check the result as well.
    if ($sxe !== false && count(libxml_get_errors()) === 0) {
      $this->document = dom_import_simplexml($sxe)->ownerDocument;
    } else {
      $this->document->loadHTML('<?xml encoding="' . $this->getEncoding() . '">' . $html);

      // remove the "xml-encoding" hack; collect the nodes first, because
      // childNodes is a live list and removing while iterating can skip nodes
      $processingInstructions = array();
      foreach ($this->document->childNodes as $child) {
        if ($child->nodeType == XML_PI_NODE) {
          $processingInstructions[] = $child;
        }
      }
      foreach ($processingInstructions as $processingInstruction) {
        $this->document->removeChild($processingInstruction);
      }

      libxml_clear_errors();
    }

    // set encoding
    $this->document->encoding = $this->getEncoding();

    // restore lib-xml settings
    libxml_use_internal_errors($internalErrors);
    if ($canToggleEntityLoader) {
      libxml_disable_entity_loader($disableEntityLoader);
    }

    return $this->document;
  }
225
226
  /**
   * Callback function for preg_replace_callback use.
   *
   * Converts the matched text from 'HTML-ENTITIES' to UTF-8.
   *
   * The parameter is taken by value: the array is never modified here, and
   * preg_replace_callback() passes it by value, which triggers a warning on
   * PHP 8 when the callback declares a by-reference parameter.
   *
   * @param  array $matches PREG matches; only element 0 (the full match) is used
   *
   * @return string
   */
  protected function entityCallback($matches)
  {
    return mb_convert_encoding($matches[0], 'UTF-8', 'HTML-ENTITIES');
  }
237
238
  /**
   * Return the first element matching the given id.
   *
   * Shortcut for find() with the selector "#<id>" and index 0.
   *
   * @param string $id
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementById($id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
249
250
  /**
   * Return the first element with the given tag name.
   *
   * Shortcut for find() with the tag name as selector and index 0.
   *
   * @param string $name
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementByTagName($name)
  {
    $first = $this->find($name, 0);

    return $first;
  }
261
262
  /**
   * Return the elements matching the given id.
   *
   * Shortcut for find() with the selector "#<id>"; $idx is passed through.
   *
   * @param string   $id
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementsById($id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
274
275
  /**
   * Return the elements with the given tag name.
   *
   * Shortcut for find() with the tag name as selector; $idx is passed through.
   *
   * @param string   $name
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $matches = $this->find($name, $idx);

    return $matches;
  }
287
288
  /**
   * Find list of nodes with a CSS selector.
   *
   * The selector is converted to XPath with SelectorConverter::toXPath() and
   * evaluated against the document; every match is wrapped in a SimpleHtmlDom.
   *
   * @param string   $selector
   * @param int|null $idx null returns the whole result list; otherwise the
   *                      element at that index is returned, with a negative
   *                      value being added to the number of matches first
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDom|SimpleHtmlDomNodeBlank the list when
   *         $idx is null, the element at $idx when it exists, otherwise a
   *         SimpleHtmlDomNodeBlank
   */
  public function find($selector, $idx = null)
  {
    $xPathQuery = SelectorConverter::toXPath($selector);

    $xPath = new DOMXPath($this->document);
    $nodesList = $xPath->query($xPathQuery);
    $elements = new SimpleHtmlDomNode();

    // DOMXPath::query() returns false for a malformed expression; treat that
    // as "no matches" instead of iterating over a boolean.
    if ($nodesList !== false) {
      foreach ($nodesList as $node) {
        $elements[] = new SimpleHtmlDom($node);
      }
    }

    if (null === $idx) {
      return $elements;
    }

    if ($idx < 0) {
      $idx = count($elements) + $idx;
    }

    if (isset($elements[$idx])) {
      return $elements[$idx];
    }

    return new SimpleHtmlDomNodeBlank();
  }
322
323
  /**
   * Clean up markup produced by DOMDocument::saveHTML().
   *
   * Depending on how the document was created, wrapper tags and newlines are
   * stripped. Afterwards numeric entities matching the pattern below are passed
   * through entityCallback(), and the trimmed result is url-decoded.
   *
   * @param string $content
   *
   * @return mixed
   */
  protected function fixHtmlOutput($content)
  {
    $tokensToStrip = null;

    if ($this->isDOMDocumentCreatedWithoutHtml === true) {
      // INFO: DOMDocument will encapsulate plaintext into a paragraph tag (<p>),
      //          so we try to remove it here again ...
      $tokensToStrip = array(
          "\n",
          '<p>', '</p>',
          "\n" . '<simpleHtmlDomP>', '<simpleHtmlDomP>', '</simpleHtmlDomP>',
          '<body>', '</body>',
          '<html>', '</html>',
      );
    } elseif ($this->isDOMDocumentCreatedWithoutHtmlWrapper === true) {
      // same list, but paragraph tags are kept
      $tokensToStrip = array(
          "\n",
          "\n" . '<simpleHtmlDomP>', '<simpleHtmlDomP>', '</simpleHtmlDomP>',
          '<body>', '</body>',
          '<html>', '</html>',
      );
    }

    if ($tokensToStrip !== null) {
      $content = str_replace($tokensToStrip, '', $content);
    }

    // replace html entities which represent UTF-8 codepoints.
    $decoded = preg_replace_callback("/&#\d{2,5};/", array($this, 'entityCallback'), $content);
    $trimmed = trim($decoded);

    return urldecode($trimmed);
  }
363
364
  /**
   * Access the wrapped DOM document.
   *
   * @return DOMDocument
   */
  public function getDocument()
  {
    $document = $this->document;

    return $document;
  }
371
372
  /**
   * Encoding used for the document (the value of $this->encoding).
   *
   * @return string
   */
  private function getEncoding()
  {
    $encoding = $this->encoding;

    return $encoding;
  }
381
382
  /**
   * Current value of the "created without html" flag.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtml()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtml;

    return $flag;
  }
389
390
  /**
   * Current value of the "created without html wrapper" flag.
   *
   * @return bool
   */
  public function getIsDOMDocumentCreatedWithoutHtmlWrapper()
  {
    $flag = $this->isDOMDocumentCreatedWithoutHtmlWrapper;

    return $flag;
  }
397
398
  /**
   * Get dom node's outer html
   *
   * If a callback has been registered it is invoked with this parser first.
   * When the "without html wrapper" flag is set only the document element is
   * serialized, otherwise the whole document; the result is then passed
   * through fixHtmlOutput().
   *
   * @return string
   */
  public function html()
  {
    if ($this::$callback !== null) {
      call_user_func_array($this::$callback, array($this));
    }

    $content = $this->getIsDOMDocumentCreatedWithoutHtmlWrapper()
        ? $this->document->saveHTML($this->document->documentElement)
        : $this->document->saveHTML();

    return $this->fixHtmlOutput($content);
  }
417
418
  /**
   * Get dom node's inner html
   *
   * Serializes every child of the document element, passes each piece through
   * fixHtmlOutput() and concatenates the results.
   *
   * @return string empty string when the document has no document element
   */
  public function innerHtml()
  {
    $text = '';

    // An empty document has no document element; return early instead of
    // reading childNodes from null.
    $root = $this->document->documentElement;
    if ($root === null) {
      return $text;
    }

    foreach ($root->childNodes as $node) {
      $text .= $this->fixHtmlOutput($this->document->saveHTML($node));
    }

    return $text;
  }
433
434
  /**
   * Load HTML from string
   *
   * Replaces the current document with the one built by createDOMDocument().
   *
   * @param string $html
   *
   * @return HtmlDomParser this instance
   *
   * @throws InvalidArgumentException if argument is not string
   */
  public function loadHtml($html)
  {
    if (is_string($html)) {
      $this->document = $this->createDOMDocument($html);

      return $this;
    }

    throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
  }
453
454
  /**
   * Load HTML from file
   *
   * Paths starting with http:// or https:// are not checked for existence;
   * any other path must exist. The content is read with file_get_contents()
   * and passed to loadHtml().
   *
   * @param string $filePath
   *
   * @return HtmlDomParser this instance
   *
   * @throws InvalidArgumentException if argument is not string
   * @throws RuntimeException         if the file does not exist or cannot be read
   */
  public function loadHtmlFile($filePath)
  {
    if (!is_string($filePath)) {
      throw new InvalidArgumentException(__METHOD__ . ' expects parameter 1 to be string.');
    }

    if (!preg_match("/^https?:\/\//i", $filePath) && !file_exists($filePath)) {
      throw new RuntimeException("File $filePath not found");
    }

    try {
      $html = file_get_contents($filePath);
    } catch (\Exception $e) {
      // Reached only when an exception is raised while reading (e.g. by an
      // error handler that converts warnings); chain it so the cause is kept.
      throw new RuntimeException("Could not load file $filePath", 0, $e);
    }

    if ($html === false) {
      throw new RuntimeException("Could not load file $filePath");
    }

    $this->loadHtml($html);

    return $this;
  }
485
486
  /**
   * Save dom as string
   *
   * Returns the result of innerHtml(). Unless $filepath is the empty string,
   * the same string is also written to that path with an exclusive lock.
   *
   * @param string $filepath
   *
   * @return string
   */
  public function save($filepath = '')
  {
    $markup = $this->innerHtml();

    if ($filepath === '') {
      return $markup;
    }

    file_put_contents($filepath, $markup, LOCK_EX);

    return $markup;
  }
502
503
  /**
   * Register the callback that html() invokes before serializing.
   *
   * The value is stored in a static property, so it is not limited to this
   * instance.
   *
   * @param callable $functionName
   */
  public function set_callback($functionName)
  {
    $this::$callback = $functionName;
  }
510
511
  /**
   * Get dom node's plain text
   *
   * @return string the textContent of the document
   */
  public function text()
  {
    $plainText = $this->document->textContent;

    return $plainText;
  }
520
}
521