Completed
Push — master ( 57d66c...0d0990 )
by Lars
02:34
created

SimpleHtmlDom::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 7
Bugs 0 Features 0
Metric Value
c 7
b 0
f 0
dl 0
loc 4
ccs 3
cts 3
cp 1
rs 10
cc 1
eloc 2
nc 1
nop 1
crap 1
1
<?php
2
3
namespace voku\helper;
4
5
use BadMethodCallException;
6
use DOMElement;
7
use DOMNode;
8
use RuntimeException;
9
10
/**
 * Class SimpleHtmlDom
 *
 * Wraps a single DOMNode and offers convenience access to it: magic properties
 * (see the "@property" tags), snake_case method aliases (see the "@method" tags),
 * selector based lookups that are delegated to HtmlDomParser, and iteration over
 * the direct child nodes.
 *
 * Magic property names and alias method names are matched case-insensitively.
 *
 * @package voku\helper
 *
 * @property string outerText Get dom node's outer html (alias for "outerHtml")
 * @property string outerHtml Get dom node's outer html
 * @property string innerText Get dom node's inner html (alias for "innerHtml")
 * @property string innerHtml Get dom node's inner html
 * @property-read string     plaintext Get dom node's plain text
 * @property-read string     text      Get dom node's plain text (alias for "plaintext")
 * @property-read string     tag       Get dom node name
 * @property-read array|null attr      Get dom node attributes
 *
 * @method SimpleHtmlDomNode|SimpleHtmlDom|null children() children($idx = -1) Returns children of node
 * @method SimpleHtmlDom|null first_child() Returns the first child of node
 * @method SimpleHtmlDom|null last_child() Returns the last child of node
 * @method SimpleHtmlDom|null next_sibling() Returns the next sibling of node
 * @method SimpleHtmlDom|null prev_sibling() Returns the previous sibling of node
 * @method SimpleHtmlDom|null parent() Returns the parent of node
 * @method string outerText() Get dom node's outer html (alias for "outerHtml()")
 * @method string outerHtml() Get dom node's outer html
 * @method string innerText() Get dom node's inner html (alias for "innerHtml()")
 *
 */
class SimpleHtmlDom implements \IteratorAggregate
{
  /**
   * Alias method name (lower-case) => name of the real method it forwards to.
   * Looked up by __call().
   *
   * @var array
   */
  protected static $functionAliases = array(
      'children'     => 'childNodes',
      'first_child'  => 'firstChild',
      'last_child'   => 'lastChild',
      'next_sibling' => 'nextSibling',
      'prev_sibling' => 'previousSibling',
      'parent'       => 'parentNode',
      'outertext'    => 'html',
      'outerhtml'    => 'html',
      'innertext'    => 'innerHtml',
      'innerhtml'    => 'innerHtml',
  );

  /**
   * The wrapped node. The constructor accepts any DOMNode, so this is not
   * guaranteed to be a DOMElement (child iteration also wraps e.g. text nodes).
   *
   * @var DOMNode|DOMElement
   */
  protected $node;

  /**
   * SimpleHtmlDom constructor.
   *
   * @param DOMNode $node The node to wrap.
   */
  public function __construct(DOMNode $node)
  {
    $this->node = $node;
  }

  /**
   * Forward calls of alias methods (see $functionAliases) to the real method.
   *
   * @param string $name      Called method name, matched case-insensitively.
   * @param array  $arguments Arguments passed through to the real method.
   *
   * @return null|string|SimpleHtmlDom|SimpleHtmlDomNode
   *
   * @throws BadMethodCallException If the name is not a known alias.
   */
  public function __call($name, $arguments)
  {
    // PHP passes the name exactly as the caller wrote it, but the alias map is
    // keyed in lower-case: without this "outerText()" would never be found.
    $alias = strtolower($name);

    if (isset(self::$functionAliases[$alias])) {
      return call_user_func_array(array($this, self::$functionAliases[$alias]), $arguments);
    }

    throw new BadMethodCallException('Method does not exist');
  }

  /**
   * Read a magic property; every unknown name is treated as an attribute name.
   *
   * @param string $name Property name, matched case-insensitively.
   *
   * @return array|null|string
   */
  public function __get($name)
  {
    $name = strtolower($name);

    switch ($name) {
      case 'outerhtml':
      case 'outertext':
        return $this->html();
      case 'innerhtml':
      case 'innertext':
        return $this->innerHtml();
      case 'text':
      case 'plaintext':
        return $this->text();
      case 'tag':
        return $this->node->nodeName;
      case 'attr':
        return $this->getAllAttributes();
      default:
        return $this->getAttribute($name);
    }
  }

  /**
   * Shortcut for find(): allows "$element('div.foo')".
   *
   * @param string   $selector
   * @param null|int $idx
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNode|null
   */
  public function __invoke($selector, $idx = null)
  {
    return $this->find($selector, $idx);
  }

  /**
   * Check whether a magic property is available; every unknown name is
   * treated as an attribute name.
   *
   * @param string $name Property name, matched case-insensitively (like in __get()).
   *
   * @return bool
   */
  public function __isset($name)
  {
    $name = strtolower($name);

    switch ($name) {
      case 'outertext':
      case 'outerhtml':
      case 'innertext':
      case 'innerhtml':
      case 'plaintext':
      case 'text':
      case 'tag':
        return true;
      default:
        return $this->hasAttribute($name);
    }
  }

  /**
   * Write a magic property: the outer html replaces this node, the inner html
   * replaces its children, every other name is written as an attribute.
   *
   * @param string $name  Property name, matched case-insensitively (like in __get()).
   * @param mixed  $value New html / attribute value.
   *
   * @return SimpleHtmlDom|null
   */
  public function __set($name, $value)
  {
    $name = strtolower($name);

    switch ($name) {
      case 'outerhtml':
      case 'outertext':
        return $this->replaceNode($value);
      case 'innerhtml':
      case 'innertext':
        return $this->replaceChild($value);
      default:
        return $this->setAttribute($name, $value);
    }
  }

  /**
   * @return string The outer html of the node.
   */
  public function __toString()
  {
    return $this->html();
  }

  /**
   * Remove the attribute with the given name.
   *
   * @param string $name
   *
   * @return SimpleHtmlDom
   */
  public function __unset($name)
  {
    return $this->setAttribute($name, null);
  }

  /**
   * Returns children of node
   *
   * @param int $idx -1 (default) returns the whole list, otherwise the child at that position.
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDom|null Null if there is no child at $idx.
   */
  public function childNodes($idx = -1)
  {
    $nodeList = $this->getIterator();

    if ($idx === -1) {
      return $nodeList;
    }

    if (isset($nodeList[$idx])) {
      return $nodeList[$idx];
    }

    return null;
  }

  /**
   * Find list of nodes with a CSS selector
   *
   * The lookup is delegated to a HtmlDomParser created from this element.
   *
   * @param string   $selector
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function find($selector, $idx = null)
  {
    return $this->getHtmlDomParser()->find($selector, $idx);
  }

  /**
   * Returns the first child of node
   *
   * @return SimpleHtmlDom|null
   */
  public function firstChild()
  {
    return $this->wrapOrNull($this->node->firstChild);
  }

  /**
   * Returns array of attributes
   *
   * @return array|null Attribute name => value, or null if the node has no attributes.
   */
  public function getAllAttributes()
  {
    if (!$this->node->hasAttributes()) {
      return null;
    }

    $attributes = array();
    foreach ($this->node->attributes as $attr) {
      $attributes[$attr->name] = HtmlDomParser::putReplacedBackToPreserveHtmlEntities($attr->value);
    }

    return $attributes;
  }

  /**
   * Return attribute value
   *
   * @param string $name
   *
   * @return string Empty string if the wrapped node is not an element.
   */
  public function getAttribute($name)
  {
    // Only elements carry attributes; e.g. a wrapped text node has no getAttribute().
    if (!$this->node instanceof DOMElement) {
      return '';
    }

    $html = $this->node->getAttribute($name);

    return HtmlDomParser::putReplacedBackToPreserveHtmlEntities($html);
  }

  /**
   * Return SimpleHtmlDom by id.
   *
   * Runs find() with the selector "#<id>" and index 0.
   *
   * @param string $id
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementById($id)
  {
    return $this->find("#$id", 0);
  }

  /**
   * Return SimpleHtmlDom by tag name.
   *
   * @param string $name
   *
   * @return SimpleHtmlDom|SimpleHtmlDomNodeBlank The first matching descendant, or a blank node if there is none.
   */
  public function getElementByTagName($name)
  {
    $node = $this->node->getElementsByTagName($name)->item(0);

    if ($node === null) {
      return new SimpleHtmlDomNodeBlank();
    }

    return new self($node);
  }

  /**
   * Returns Elements by id
   *
   * Runs find() with the selector "#<id>".
   *
   * @param string   $id
   * @param null|int $idx
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDomNode[]|SimpleHtmlDomNodeBlank
   */
  public function getElementsById($id, $idx = null)
  {
    return $this->find("#$id", $idx);
  }

  /**
   * Returns Elements by tag name
   *
   * @param string   $name
   * @param null|int $idx Null returns the whole list; a negative index counts from the end of the list.
   *
   * @return SimpleHtmlDomNode|SimpleHtmlDom|SimpleHtmlDomNodeBlank A blank node if there is no element at $idx.
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $elements = new SimpleHtmlDomNode();

    foreach ($this->node->getElementsByTagName($name) as $node) {
      $elements[] = new self($node);
    }

    if (null === $idx) {
      return $elements;
    }

    if ($idx < 0) {
      $idx = count($elements) + $idx;
    }

    if (isset($elements[$idx])) {
      return $elements[$idx];
    }

    return new SimpleHtmlDomNodeBlank();
  }

  /**
   * Create a new "HtmlDomParser"-object from the current context.
   *
   * @return HtmlDomParser
   */
  public function getHtmlDomParser()
  {
    return new HtmlDomParser($this);
  }

  /**
   * Retrieve an external iterator
   *
   * Every direct child node of the wrapped node is wrapped in its own SimpleHtmlDom.
   *
   * @link  http://php.net/manual/en/iteratoraggregate.getiterator.php
   * @return SimpleHtmlDomNode An instance of an object implementing <b>Iterator</b> or
   * <b>Traversable</b>
   */
  public function getIterator()
  {
    $elements = new SimpleHtmlDomNode();

    if ($this->node->hasChildNodes()) {
      foreach ($this->node->childNodes as $node) {
        $elements[] = new self($node);
      }
    }

    return $elements;
  }

  /**
   * @return DOMNode The wrapped node.
   */
  public function getNode()
  {
    return $this->node;
  }

  /**
   * Determine if an attribute exists on the element.
   *
   * @param string $name
   *
   * @return bool Always false if the wrapped node is not an element.
   */
  public function hasAttribute($name)
  {
    return $this->node instanceof DOMElement && $this->node->hasAttribute($name);
  }

  /**
   * Get dom node's outer html
   *
   * @return string
   */
  public function html()
  {
    return $this->getHtmlDomParser()->html();
  }

  /**
   * Get dom node's inner html
   *
   * @return string
   */
  public function innerHtml()
  {
    return $this->getHtmlDomParser()->innerHtml();
  }

  /**
   * Returns the last child of node
   *
   * @return SimpleHtmlDom|null
   */
  public function lastChild()
  {
    return $this->wrapOrNull($this->node->lastChild);
  }

  /**
   * Returns the next sibling of node
   *
   * @return SimpleHtmlDom|null
   */
  public function nextSibling()
  {
    return $this->wrapOrNull($this->node->nextSibling);
  }

  /**
   * Returns the parent of node
   *
   * @return SimpleHtmlDom|null Null if the node has no parent (e.g. a document or a detached node).
   */
  public function parentNode()
  {
    return $this->wrapOrNull($this->node->parentNode);
  }

  /**
   * Returns the previous sibling of node
   *
   * @return SimpleHtmlDom|null
   */
  public function previousSibling()
  {
    return $this->wrapOrNull($this->node->previousSibling);
  }

  /**
   * Wrap a related node, passing a missing relative (null) through unchanged.
   *
   * @param DOMNode|null $node
   *
   * @return SimpleHtmlDom|null
   */
  private function wrapOrNull($node)
  {
    if ($node === null) {
      return null;
    }

    return new self($node);
  }

  /**
   * Replace child node
   *
   * Removes all current children and, unless $string is empty, appends the
   * parsed html fragment instead.
   *
   * NOTE(review): "empty" follows PHP's empty(), so the string "0" also just
   * clears the children - confirm whether that is intended.
   *
   * @param string $string New inner html.
   *
   * @return $this
   *
   * @throws RuntimeException If the parsed html does not match the given string.
   */
  protected function replaceChild($string)
  {
    $newDocument = null;

    if (!empty($string)) {
      $newDocument = new HtmlDomParser($string);

      // Strict comparison: both sides are strings and must not be compared as numbers.
      if ($this->normalizeStringForComparision($newDocument->outertext) !== $this->normalizeStringForComparision($string)) {
        throw new RuntimeException('Not valid HTML fragment');
      }
    }

    // "childNodes" is a live list: removing nodes while iterating over it skips
    // every second child, so keep removing the first child until none is left.
    while ($this->node->firstChild !== null) {
      $this->node->removeChild($this->node->firstChild);
    }

    if ($newDocument !== null) {
      $newDocument = $this->cleanHtmlWrapper($newDocument);

      $newNode = $this->node->ownerDocument->importNode($newDocument->getDocument()->documentElement, true);

      $this->node->appendChild($newNode);
    }

    return $this;
  }

  /**
   * Replace this node
   *
   * An empty $string removes the node from its parent; otherwise the node is
   * replaced by the parsed html fragment, which then becomes the wrapped node.
   *
   * @param string $string New outer html.
   *
   * @return $this|null Null if the node was removed.
   *
   * @throws RuntimeException If the parsed html does not match the given string.
   */
  protected function replaceNode($string)
  {
    if (empty($string)) {
      $this->node->parentNode->removeChild($this->node);

      return null;
    }

    $newDocument = new HtmlDomParser($string);

    // Strict comparison: both sides are strings and must not be compared as numbers.
    if ($this->normalizeStringForComparision($newDocument->outertext) !== $this->normalizeStringForComparision($string)) {
      throw new RuntimeException('Not valid HTML fragment');
    }

    $newDocument = $this->cleanHtmlWrapper($newDocument);

    $newNode = $this->node->ownerDocument->importNode($newDocument->getDocument()->documentElement, true);

    $this->node->parentNode->replaceChild($newNode, $this->node);
    $this->node = $newNode;

    return $this;
  }

  /**
   * Normalize the given string for comparison.
   *
   * Lower-cases the string, strips spaces and line breaks and evens out
   * differences in url-encoding.
   *
   * @param string $string
   *
   * @return string
   */
  private function normalizeStringForComparision($string)
  {
    $withoutWhitespace = str_replace(array(' ', "\n", "\r\n", "\r"), '', strtolower($string));

    return urlencode(urldecode(trim($withoutWhitespace)));
  }

  /**
   * Strip the wrapper nodes (doctype, "html", "body") from a document that was
   * created from an html fragment, and rename the first "p" element.
   *
   * Documents for which getIsDOMDocumentCreatedWithoutHtml() is not true are
   * returned untouched.
   *
   * @param HtmlDomParser $newDocument
   *
   * @return HtmlDomParser
   */
  protected function cleanHtmlWrapper(HtmlDomParser $newDocument)
  {
    if ($newDocument->getIsDOMDocumentCreatedWithoutHtml() !== true) {
      return $newDocument;
    }

    $document = $newDocument->getDocument();

    // Remove doc-type node.
    $doctype = $document->doctype;
    if ($doctype !== null && $doctype->parentNode !== null) {
      $doctype->parentNode->removeChild($doctype);
    }

    // Remove html element, preserving child nodes.
    $this->unwrapElement($document->getElementsByTagName('html')->item(0));

    // Remove body element, preserving child nodes.
    $this->unwrapElement($document->getElementsByTagName('body')->item(0));

    // At this point DOMDocument still added a "<p>"-wrapper around our string,
    // so we replace it with "<simpleHtmlDomP>" and delete this at the ending ...
    $paragraph = $document->getElementsByTagName('p')->item(0);
    if ($paragraph !== null) {
      $this->changeElementName($paragraph, 'simpleHtmlDomP');
    }

    return $newDocument;
  }

  /**
   * Replace an element by its own child nodes; does nothing for null or detached nodes.
   *
   * @param DOMNode|null $element
   *
   * @return void
   */
  private function unwrapElement($element)
  {
    if (!$element instanceof DOMNode || $element->parentNode === null) {
      return;
    }

    $fragment = $element->ownerDocument->createDocumentFragment();
    while ($element->childNodes->length > 0) {
      $fragment->appendChild($element->childNodes->item(0));
    }

    if ($fragment->hasChildNodes()) {
      $element->parentNode->replaceChild($fragment, $element);
    } else {
      // An empty fragment can not be inserted, so a childless element is simply dropped.
      $element->parentNode->removeChild($element);
    }
  }

  /**
   * change the name of a tag in a "DOMNode"
   *
   * Creates a new element with the given name, copies children and attributes
   * over and puts it in place of the old node.
   *
   * @param DOMNode $node
   * @param string  $name
   *
   * @return DOMElement The new element.
   */
  protected function changeElementName(\DOMNode $node, $name)
  {
    $newnode = $node->ownerDocument->createElement($name);

    foreach ($node->childNodes as $child) {
      // importNode() hands back a deep copy, so the list we iterate over stays untouched.
      $newnode->appendChild($node->ownerDocument->importNode($child, true));
    }

    // Non-element nodes have no attribute map.
    if ($node->attributes !== null) {
      foreach ($node->attributes as $attrName => $attrNode) {
        // $attrNode is a DOMAttr object: copy its string value, not the object itself.
        $newnode->setAttribute($attrName, $attrNode->value);
      }
    }

    // Replace on the real parent, so this also works for nodes that are not
    // direct children of the document.
    $parent = $node->parentNode;
    if ($parent !== null) {
      $parent->replaceChild($newnode, $node);
    }

    return $newnode;
  }

  /**
   * Set attribute value
   *
   * A blank value (null, false, empty string, empty array) removes the
   * attribute; "0" and 0 are regular values and are written as-is.
   *
   * @param string $name
   * @param mixed  $value
   *
   * @return $this
   */
  public function setAttribute($name, $value)
  {
    // Not empty(): that would also drop legitimate values like tabindex="0".
    $isBlank = ($value === null || $value === false || $value === '' || $value === array());

    if ($isBlank) {
      $this->node->removeAttribute($name);
    } else {
      $this->node->setAttribute($name, $value);
    }

    return $this;
  }

  /**
   * Get dom node's plain text
   *
   * @return string
   */
  public function text()
  {
    return $this->node->textContent;
  }
}
632