Issues (994)

src/simplehtmldom/HtmlDocument.php (9 issues)

1
<?php
2
3
namespace simplehtmldom;
4
5
/**
6
 * Website: http://sourceforge.net/projects/simplehtmldom/
7
 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/).
8
 *
9
 * Licensed under The MIT License
10
 * See the LICENSE file in the project root for more information.
11
 *
12
 * Authors:
13
 *   S.C. Chen
14
 *   John Schlick
15
 *   Rus Carroll
16
 *   logmanoriginal
17
 *
18
 * Contributors:
19
 *   Yousuke Kumakura
20
 *   Vadim Voituk
21
 *   Antcs
22
 *
23
 * Version Rev. 2.0-RC2 (415)
24
 */
25
include_once __DIR__ . '/constants.php';
26
include_once __DIR__ . '/HtmlNode.php';
27
include_once __DIR__ . '/Debug.php';
28
29
/**
30
 * HTMLDocument class.
31
 */
32
class HtmlDocument
33
{
34
  /**
35
   * HtmlNode instance.
36
   *
37
   * @var HtmlNode
38
   */
39
  public $root = null;
40
  public $nodes = [];
41
  public $callback = null;
42
  public $lowercase = false;
43
  public $original_size;
44
  public $size;
45
46
  protected $pos;
47
  protected $doc;
48
  protected $char;
49
50
  protected $cursor;
51
  protected $parent;
52
  protected $noise = [];
53
  protected $token_blank = " \t\r\n";
54
  protected $token_equal = ' =/>';
55
  protected $token_slash = " />\r\n\t";
56
  protected $token_attr = ' >';
57
58
  public $_charset = '';
59
  public $_target_charset = '';
60
61
  public $default_br_text = '';
62
  public $default_span_text = '';
63
64
  protected $self_closing_tags = [
65
    'area' => 1,
66
    'base' => 1,
67
    'br' => 1,
68
    'col' => 1,
69
    'embed' => 1,
70
    'hr' => 1,
71
    'img' => 1,
72
    'input' => 1,
73
    'link' => 1,
74
    'meta' => 1,
75
    'param' => 1,
76
    'source' => 1,
77
    'track' => 1,
78
    'wbr' => 1,
79
  ];
80
  protected $block_tags = [
81
    'body' => 1,
82
    'div' => 1,
83
    'form' => 1,
84
    'root' => 1,
85
    'span' => 1,
86
    'table' => 1,
87
  ];
88
  protected $optional_closing_tags = [
89
    // Not optional, see
90
    // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
91
    'b' => ['b' => 1],
92
    'dd' => ['dd' => 1, 'dt' => 1],
93
    // Not optional, see
94
    // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
95
    'dl' => ['dd' => 1, 'dt' => 1],
96
    'dt' => ['dd' => 1, 'dt' => 1],
97
    'li' => ['li' => 1],
98
    'optgroup' => ['optgroup' => 1, 'option' => 1],
99
    'option' => ['optgroup' => 1, 'option' => 1],
100
    'p' => ['p' => 1],
101
    'rp' => ['rp' => 1, 'rt' => 1],
102
    'rt' => ['rp' => 1, 'rt' => 1],
103
    'td' => ['td' => 1, 'th' => 1],
104
    'th' => ['td' => 1, 'th' => 1],
105
    'tr' => ['td' => 1, 'th' => 1, 'tr' => 1],
106
  ];
107
108
  /**
   * Route calls to deprecated lower_case method names.
   *
   * - `load_file` is forwarded to `loadFile` after logging a deprecation
   *   notice through Debug::log().
   * - `clear` is accepted and silently ignored.
   * - Any other name raises an E_USER_ERROR.
   *
   * @param string $func Name of the inaccessible method being called
   * @param array  $args Arguments passed to that method
   *
   * @return mixed Result of the forwarded method, or null when nothing
   *               was forwarded
   */
  public function __call($func, $args)
  {
    // Allow users to call methods with lower_case syntax
    switch ($func) {
      case 'load_file':
        $actual_function = 'loadFile';
        break;
      case 'clear':
        return null; /* no-op */
      default:
        trigger_error(
          'Call to undefined method ' . __CLASS__ . '::' . $func . '()',
          E_USER_ERROR
        );

        // A user-defined error handler may resume execution after
        // E_USER_ERROR. Stop here, otherwise $actual_function would be
        // undefined in the code below.
        return null;
    }

    // phpcs:ignore Generic.Files.LineLength
    Debug::log(__CLASS__ . '->' . $func . '() has been deprecated and will be removed in the next major version of simplehtmldom. Use ' . __CLASS__ . '->' . $actual_function . '() instead.');

    return call_user_func_array([$this, $actual_function], $args);
  }
129
130
  /**
   * Create a document and optionally load it right away.
   *
   * When $str is truthy it is treated as a file/URL if it starts with
   * "http://" or names an existing file; otherwise it is parsed as markup
   * via load(). When $str is falsy only the internal state is prepared.
   *
   * @param string|null $str             Markup, file path or http:// URL
   * @param bool        $lowercase       Forwarded to load()/prepare()
   * @param bool        $forceTagsClosed When false, the table of optional
   *                                     closing tags is emptied before
   *                                     parsing
   * @param string      $target_charset  Stored in $this->_target_charset
   * @param bool        $stripRN         Forwarded to load()
   * @param string      $defaultBRText   Forwarded to load()/prepare()
   * @param string      $defaultSpanText Forwarded to load()/prepare()
   * @param int         $options         Forwarded to load()
   */
  public function __construct(
    $str = null,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0
  ) {
    // Forcing tags to be closed implies that we don't trust the html, but
    // it can lead to parsing errors if we SHOULD trust the html.
    // This has to happen before parsing to have any effect. The previous
    // code assigned to a non-existent property ("optional_closing_array")
    // after parsing, so the flag was silently ignored.
    if (!$forceTagsClosed) {
      $this->optional_closing_tags = [];
    }

    if ($str) {
      $is_url = preg_match('/^http:\/\//i', $str);

      // is_file() raises a warning for strings longer than the maximum
      // path length, which is common when $str holds a whole document.
      $is_path = !$is_url && strlen($str) <= PHP_MAXPATHLEN && is_file($str);

      if ($is_url || $is_path) {
        // Call the real method instead of the deprecated load_file alias
        // that is only reachable through __call().
        $this->loadFile($str);
      } else {
        $this->load(
          $str,
          $lowercase,
          $stripRN,
          $defaultBRText,
          $defaultSpanText,
          $options
        );
      }
    } else {
      $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
    }

    $this->_target_charset = $target_charset;
  }
164
165
  /**
   * Provide a condensed view of the document for var_dump().
   *
   * @return array Root node, noise table (or the string 'none' when it is
   *               empty), both charsets and the original document size
   */
  public function __debugInfo()
  {
    if (empty($this->noise)) {
      $noise = 'none';
    } else {
      $noise = $this->noise;
    }

    $info = [];
    $info['root'] = $this->root;
    $info['noise'] = $noise;
    $info['charset'] = $this->_charset;
    $info['target charset'] = $this->_target_charset;
    $info['original size'] = $this->original_size;

    return $info;
  }
175
176
  /**
   * Call clear() on every node that is still registered with the document.
   */
  public function __destruct()
  {
    if (!isset($this->nodes)) {
      return;
    }

    foreach ($this->nodes as $node) {
      $node->clear();
    }
  }
184
185
  /**
   * Parse markup from a string into this document.
   *
   * Steps: reset state via prepare(), optionally collapse whitespace
   * between and inside tags (while shielding script, CDATA, comment, style
   * and code contents), hide server-side and (optionally) Smarty snippets,
   * parse, detect the charset and decode entities. The raw document string
   * is discarded afterwards.
   *
   * @param string $str             Markup to parse
   * @param bool   $lowercase       Forwarded to prepare()
   * @param bool   $stripRN         Collapse whitespace/newlines; also passed
   *                                to parse() as its trim flag
   * @param string $defaultBRText   Forwarded to prepare()
   * @param string $defaultSpanText Forwarded to prepare()
   * @param int    $options         Bit mask; HDOM_SMARTY_AS_TEXT is checked
   *
   * @return $this
   */
  public function load(
    $str,
    $lowercase = true,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0
  ) {
    $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

    if ($stripRN) {
      // Temporarily remove any element that shouldn't lose whitespace.
      // The order of the patterns is significant and must be kept.
      $shielded = [
        "'<\s*script[^>]*>(.*?)<\s*/\s*script\s*>'is",
        "'<!\[CDATA\[(.*?)\]\]>'is",
        "'<!--(.*?)-->'is",
        "'<\s*style[^>]*>(.*?)<\s*/\s*style\s*>'is",
        "'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is",
      ];

      foreach ($shielded as $pattern) {
        $this->remove_noise($pattern);
      }

      // First drop whitespace and newlines between tags, then collapse
      // whitespace and newlines inside text to a single space.
      $this->doc = preg_replace('/\>([\t\s]*[\r\n]^[\t\s]*)\</m', '><', $this->doc);
      $this->doc = preg_replace('/([\t\s]*[\r\n]^[\t\s]*)/m', ' ', $this->doc);

      // Put the shielded elements back and recompute the size
      $this->doc = $this->restore_noise($this->doc);
      $this->size = strlen($this->doc);
    }

    // Server-side script
    $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);

    if (count($this->noise)) {
      // phpcs:ignore Generic.Files.LineLength
      Debug::log('Support for server-side scripts has been deprecated and will be removed in the next major version of simplehtmldom.');
    }

    // Strip Smarty scripts
    if ($options & HDOM_SMARTY_AS_TEXT) {
      $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
      // phpcs:ignore Generic.Files.LineLength
      Debug::log('Support for Smarty scripts has been deprecated and will be removed in the next major version of simplehtmldom.');
    }

    // Build the node tree and mark where the root ends
    $this->parse($stripRN);
    $this->root->_[HtmlNode::HDOM_INFO_END] = $this->cursor;

    $this->parse_charset();
    $this->decode();

    // The raw source is no longer needed
    unset($this->doc);

    // Returning the document makes load() chainable
    return $this;
  }
238
239
  /**
   * Register a callback on the document.
   *
   * @param callable|string $function_name Value stored in $this->callback
   */
  public function set_callback($function_name)
  {
    $callback = $function_name;
    $this->callback = $callback;
  }
243
244
  /**
   * Unregister the callback by resetting $this->callback to null.
   */
  public function remove_callback()
  {
    $none = null;
    $this->callback = $none;
  }
248
249
  /**
   * Serialize the (possibly modified) document and optionally write it
   * to a file.
   *
   * @param string $filepath Target file; nothing is written when this is
   *                         the empty string
   *
   * @return string The inner text of the root node
   */
  public function save($filepath = '')
  {
    $html = $this->root->innertext();

    if ('' === $filepath) {
      return $html;
    }

    // Exclusive lock while writing
    file_put_contents($filepath, $html, LOCK_EX);

    return $html;
  }
263
264
  /**
   * Find elements by CSS selector, starting at the document root.
   *
   * All arguments are handed to HtmlNode::find() unchanged.
   *
   * @param string      $selector  CSS Selector
   * @param number|null $idx
   * @param bool        $lowercase
   *
   * @return HtmlNode[]|HtmlNode
   */
  public function find($selector, $idx = null, $lowercase = false)
  {
    $root = $this->root;

    return $root->find($selector, $idx, $lowercase);
  }
277
278
  /**
   * Get the inner text of the first "title" element.
   *
   * @return mixed The element's innertext, or null when the lookup result
   *               is falsy
   */
  public function title()
  {
    $element = $this->find('title', 0);

    if (!$element) {
      return null;
    }

    return $element->innertext;
  }
283
284
  /**
   * Delegate to HtmlNode::expect() on the document root.
   *
   * All arguments are handed over unchanged.
   *
   * @param string   $selector
   * @param int|null $idx
   * @param bool     $lowercase
   *
   * @return mixed Whatever the root node's expect() returns
   */
  public function expect($selector, $idx = null, $lowercase = false)
  {
    $root = $this->root;

    return $root->expect($selector, $idx, $lowercase);
  }
288
289
  /**
   * Delegate to HtmlNode::dump() on the document root.
   *
   * @param bool $show_attr Handed to the root node's dump() unchanged
   *
   * @codeCoverageIgnore
   */
  public function dump($show_attr = true)
  {
    $root = $this->root;
    $root->dump($show_attr);
  }
294
295
  /**
   * Reset the parser state and install a fresh root node.
   *
   * @param string|null $str             Source markup; null is treated as
   *                                     an empty document
   * @param bool        $lowercase       Stored in $this->lowercase
   * @param string      $defaultBRText   Stored in $this->default_br_text
   * @param string      $defaultSpanText Stored in $this->default_span_text
   */
  protected function prepare(
    $str,
    $lowercase = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT
  ) {
    $this->clear();

    // The constructor passes null when no markup was given. Cast first,
    // because passing null to trim() is deprecated since PHP 8.1.
    $this->doc = trim((string) $str);
    $this->size = strlen($this->doc);
    $this->original_size = $this->size; // original size of the html
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = [];
    $this->nodes = [];
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;
    $this->root = new HtmlNode($this);
    $this->root->tag = 'root';
    $this->root->_[HtmlNode::HDOM_INFO_BEGIN] = -1;
    $this->root->nodetype = HtmlNode::HDOM_TYPE_ROOT;
    $this->parent = $this->root;

    // Always reset the current character, so an empty document does not
    // inherit the last character of a previously loaded one.
    $this->char = ($this->size > 0) ? $this->doc[0] : null;
  }
322
323
  /**
   * Decode HTML entities in every registered node.
   *
   * Text and inner-text entries first get their noise restored and are
   * then decoded; attribute values are decoded directly. Attributes whose
   * value is exactly true (no value in the source) are left untouched.
   */
  protected function decode()
  {
    $flags = ENT_QUOTES | ENT_HTML5;
    $text_slots = [HtmlNode::HDOM_INFO_TEXT, HtmlNode::HDOM_INFO_INNER];

    foreach ($this->nodes as $node) {
      foreach ($text_slots as $slot) {
        if (!isset($node->_[$slot])) {
          continue;
        }

        $restored = $this->restore_noise($node->_[$slot]);
        $node->_[$slot] = html_entity_decode($restored, $flags, $this->_target_charset);
      }

      if (!isset($node->attr) || !is_array($node->attr)) {
        continue;
      }

      foreach ($node->attr as $attr_name => $attr_value) {
        if (true !== $attr_value) {
          $node->attr[$attr_name] = html_entity_decode(
            $attr_value,
            $flags,
            $this->_target_charset
          );
        }
      }
    }
  }
354
355
  /**
   * Main parser loop: alternate between text runs and tags until
   * read_tag() reports the end of the document.
   *
   * @param bool $trim When truthy, text consisting only of whitespace is
   *                   dropped; the flag is also passed on to read_tag()
   */
  protected function parse($trim = false)
  {
    for (;;) {
      if ('<' !== $this->char) {
        $text = $this->copy_until_char('<');

        // Skip whitespace between tags? (</a> <b>)
        $only_whitespace = '' !== $text && '' === trim($text);
        if ($trim && $only_whitespace) {
          continue;
        }

        if ('' !== $text) {
          $text_node = new HtmlNode($this);
          ++$this->cursor;
          $text_node->_[HtmlNode::HDOM_INFO_TEXT] = $text;
          $this->link_nodes($text_node, false);
        }
      }

      if (false === $this->read_tag($trim)) {
        return;
      }
    }
  }
379
380
  /**
   * Determine the charset of the source document.
   *
   * Sources are tried in this order until one yields a non-empty value:
   * the Content-Type header exposed by the optional global function
   * get_last_retrieve_url_contents_content_type(), a
   * meta[http-equiv=Content-Type] element, a meta[charset] element and
   * finally mb_detect_encoding(). UTF-8 is assumed when all of them fail.
   * ISO-8859-1 and its aliases are reported as CP1252.
   *
   * @return string The detected charset, also stored in $this->_charset
   */
  protected function parse_charset()
  {
    $charset = null;

    if (function_exists('get_last_retrieve_url_contents_content_type')) {
      $header = call_user_func('get_last_retrieve_url_contents_content_type');
      $success = preg_match('/charset=(.+)/', $header, $matches);

      if ($success) {
        $charset = $matches[1];
      }

      // phpcs:ignore Generic.Files.LineLength
      Debug::log('Determining charset using get_last_retrieve_url_contents_content_type() ' . ($success ? 'successful' : 'failed'));
    }

    if (empty($charset)) {
      // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
      $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);
      $fullvalue = empty($el) ? null : $el->content;

      if (!empty($fullvalue) && preg_match('/charset=(.+)/i', $fullvalue, $matches)) {
        $charset = $matches[1];
      }
    }

    if (empty($charset)) {
      // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
      $meta = $this->root->find('meta[charset]', 0);

      if ($meta) {
        $charset = $meta->charset;
      }
    }

    // Try to guess the charset based on the content
    // Requires Multibyte String (mbstring) support (optional)
    if (empty($charset) && function_exists('mb_detect_encoding')) {
      /**
       * mb_detect_encoding() is not intended to distinguish between
       * charsets, especially single-byte charsets. Its primary
       * purpose is to detect which multibyte encoding is in use,
       * i.e. UTF-8, UTF-16, shift-JIS, etc.
       *
       * -- https://bugs.php.net/bug.php?id=38138
       *
       * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
       * always result in CP1251/ISO-8859-5 and vice versa.
       *
       * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
       * to stay compatible.
       */
      $encoding = mb_detect_encoding(
        $this->doc,
        ['UTF-8', 'CP1252', 'ISO-8859-1']
      );

      // Due to a limitation of mb_detect_encoding
      // 'CP1251'/'ISO-8859-5' will be detected as
      // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
      // which case we can simply assume it is the other charset.
      $looks_western = in_array($encoding, ['CP1252', 'ISO-8859-1'], true);
      if ($looks_western && !@iconv('CP1252', 'UTF-8', $this->doc)) {
        $encoding = 'CP1251';
      }

      if (false !== $encoding) {
        $charset = $encoding;
      }
    }

    if (empty($charset)) {
      Debug::log('Unable to determine charset from source document. Assuming UTF-8');
      $charset = 'UTF-8';
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want
    // it instead.
    $latin1_aliases = ['iso-8859-1', 'latin1', 'latin-1'];
    if (in_array(strtolower($charset), $latin1_aliases, true)) {
      $charset = 'CP1252';
    }

    $this->_charset = $charset;

    return $charset;
  }
478
479
  /**
   * Read one tag (or tag-like construct) at the current parser position.
   *
   * Handled in this order: end of document, end tags, declarations that
   * start with "!" (comment, CDATA, unknown), invalid tag names and
   * finally regular start tags together with their attributes.
   *
   * @param bool $trim When truthy, blank characters after "<" are skipped,
   *                   end-tag names are trimmed and attribute spacing is
   *                   not recorded on the node
   *
   * @return mixed False when the current character is not "<" (treated as
   *               end of file); the result of as_text_node() for end tags
   *               that cannot be matched to a start tag; true otherwise
   */
  protected function read_tag($trim)
480
  {
481
    if ('<' !== $this->char) { // End Of File
482
      $this->root->_[HtmlNode::HDOM_INFO_END] = $this->cursor;
483
484
      // We might be in a nest of unclosed elements for which the end tags
485
      // can be omitted. Close them for faster seek operations.
486
      do {
487
        if (isset($this->optional_closing_tags[strtolower($this->parent->tag)])) {
488
          $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;
489
        }
490
      } while ($this->parent = $this->parent->parent);
491
492
      return false;
493
    }
494
495
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
496
497
    if ($trim) { // "<   /html>"
498
      $this->skip($this->token_blank);
499
    }
500
501
    // End tag: https://dev.w3.org/html5/pf-summary/syntax.html#end-tags
502
    if ('/' === $this->char) {
503
      $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
504
505
      $tag = $this->copy_until_char('>');
506
      $tag = $trim ? ltrim($tag, $this->token_blank) : $tag;
507
508
      // Skip attributes and whitespace in end tags
509
      if ($trim && false !== ($pos = strpos($tag, ' '))) {
510
        // phpcs:ignore Generic.Files.LineLength
511
        Debug::log_once('Source document contains superfluous whitespace in end tags (</html   >).');
512
        $tag = substr($tag, 0, $pos);
513
      }
514
515
      if (strcasecmp($this->parent->tag, $tag)) { // Parent is not start tag
516
        $parent_lower = strtolower($this->parent->tag);
517
        $tag_lower = strtolower($tag);
518
        if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
519
          $org_parent = $this->parent;
520
521
          // Look for the start tag
522
          while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
523
            // Close any unclosed element with optional end tags
524
            if (isset($this->optional_closing_tags[strtolower($this->parent->tag)])) {
525
              $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;
526
            }
527
            $this->parent = $this->parent->parent;
528
          }
529
530
          // No start tag, close grandparent
531
          if (strtolower($this->parent->tag) !== $tag_lower) {
532
            $this->parent = $org_parent;
533
534
            if ($this->parent->parent) {
535
              $this->parent = $this->parent->parent;
536
            }
537
538
            $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;
539
540
            return $this->as_text_node($tag);
541
          }
542
        } elseif (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
543
          // grandparent exists + current is block tag
544
          // Parent has no end tag
545
          $this->parent->_[HtmlNode::HDOM_INFO_END] = 0;
546
          $org_parent = $this->parent;
547
548
          // Find start tag
549
          while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
550
            $this->parent = $this->parent->parent;
551
          }
552
553
          // No start tag, close parent
554
          if (strtolower($this->parent->tag) !== $tag_lower) {
555
            $this->parent = $org_parent; // restore original parent
556
            $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;
557
558
            return $this->as_text_node($tag);
559
          }
560
        } elseif (($this->parent->parent) && strtolower($this->parent->parent->tag) === $tag_lower) {
561
          // Grandparent exists and current tag closes it
562
          $this->parent->_[HtmlNode::HDOM_INFO_END] = 0;
563
          $this->parent = $this->parent->parent;
564
        } else { // Random tag, add as text node
565
          return $this->as_text_node($tag);
566
        }
567
      }
568
569
      // Link with start tag
570
      $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;
571
572
      if ($this->parent->parent) {
573
        $this->parent = $this->parent->parent;
574
      }
575
576
      $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
577
      return true;
578
    }
579
580
    // Start tag: https://dev.w3.org/html5/pf-summary/syntax.html#start-tags
581
    $node = new HtmlNode($this);
582
    $node->_[HtmlNode::HDOM_INFO_BEGIN] = $this->cursor++;
583
584
    // Tag name
585
    $tag = $this->copy_until($this->token_slash);
586
587
    if (isset($tag[0]) && '!' === $tag[0]) { // Doctype, CData, Comment
588
      if (isset($tag[2]) && '-' === $tag[1] && '-' === $tag[2]) { // Comment ("<!--")
589
        // Go back until $tag only contains start of comment "!--".
590
        while (strlen($tag) > 3) {
591
          $this->char = $this->doc[--$this->pos]; // previous
592
          $tag = substr($tag, 0, strlen($tag) - 1);
593
        }
594
595
        $node->nodetype = HtmlNode::HDOM_TYPE_COMMENT;
596
        $node->tag = 'comment';
597
598
        $data = '';
599
600
        // There is a rare chance of empty comment: "<!---->"
601
        // In which case the current char is the first "-" of the end tag
602
        // But the comment could also just be a dash: "<!----->"
603
        while (true) {
604
          // Copy until first char of end tag
605
          $data .= $this->copy_until_char('-');
606
607
          // Look ahead in the document, maybe we are at the end
608
          if (($this->pos + 3) > $this->size) { // End of document
609
            Debug::log('Source document ended unexpectedly!');
610
            break;
611
          } elseif ('-->' === substr($this->doc, $this->pos, 3)) { // end
612
            $data .= $this->copy_until_char('>');
613
            break;
614
          }
615
616
          $data .= $this->char;
617
          $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
618
        }
619
620
        $tag .= $data;
621
        $tag = $this->restore_noise($tag);
622
623
        // Comment starts after "!--" and ends before "--" (5 chars total)
624
        $node->_[HtmlNode::HDOM_INFO_INNER] = substr($tag, 3, strlen($tag) - 5);
625
      } elseif ('[CDATA[' === substr($tag, 1, 7)) {
626
        // Go back until $tag only contains start of cdata "![CDATA[".
627
        while (strlen($tag) > 8) {
628
          $this->char = $this->doc[--$this->pos]; // previous
629
          $tag = substr($tag, 0, strlen($tag) - 1);
630
        }
631
632
        // CDATA can contain HTML stuff, need to find closing tags first
633
        $node->nodetype = HtmlNode::HDOM_TYPE_CDATA;
634
        $node->tag = 'cdata';
635
636
        $data = '';
637
638
        // There is a rare chance of empty CDATA: "<![CDATA[]]>"
639
        // In which case the current char is the first "]" of the end tag
640
        // But the CDATA could also just be a bracket: "<![CDATA[]]]>"
641
        while (true) {
642
          // Copy until first char of end tag
643
          $data .= $this->copy_until_char(']');
644
645
          // Look ahead in the document, maybe we are at the end
646
          if (($this->pos + 3) > $this->size) { // End of document
647
            Debug::log('Source document ended unexpectedly!');
648
            break;
649
          } elseif (']]>' === substr($this->doc, $this->pos, 3)) { // end
650
            $data .= $this->copy_until_char('>');
651
            break;
652
          }
653
654
          $data .= $this->char;
655
          $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
656
        }
657
658
        $tag .= $data;
659
        $tag = $this->restore_noise($tag);
660
661
        // CDATA starts after "![CDATA[" and ends before "]]" (10 chars total)
662
        $node->_[HtmlNode::HDOM_INFO_INNER] = substr($tag, 8, strlen($tag) - 10);
663
      } else { // Unknown
664
        Debug::log('Source document contains unknown declaration: <' . $tag);
665
        $node->nodetype = HtmlNode::HDOM_TYPE_UNKNOWN;
666
        $node->tag = 'unknown';
667
      }
668
669
      $node->_[HtmlNode::HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
670
671
      if ('>' === $this->char) {
672
        $node->_[HtmlNode::HDOM_INFO_TEXT] .= '>';
673
      }
674
675
      $this->link_nodes($node, true);
676
      $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
677
      return true;
678
    }
679
680
    if (!preg_match('/^\w[\w:-]*$/', $tag)) { // Invalid tag name
681
      $node->_[HtmlNode::HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
682
683
      if ('>' === $this->char) { // End tag
684
        $node->_[HtmlNode::HDOM_INFO_TEXT] .= '>';
685
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
686
      }
687
688
      $this->link_nodes($node, false);
689
      Debug::log('Source document contains invalid tag name: ' . $node->_[HtmlNode::HDOM_INFO_TEXT]);
690
691
      return true;
692
    }
693
694
    // Valid tag name
695
    $node->nodetype = HtmlNode::HDOM_TYPE_ELEMENT;
696
    $tag_lower = strtolower($tag);
697
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;
698
699
    if (isset($this->optional_closing_tags[$tag_lower])) { // Optional closing tag
700
      while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
701
        // Previous element was the last element of ancestor
702
        $this->parent->_[HtmlNode::HDOM_INFO_END] = $node->_[HtmlNode::HDOM_INFO_BEGIN] - 1;
703
        $this->parent = $this->parent->parent;
704
      }
705
      $node->parent = $this->parent;
706
    }
707
708
    $guard = 0; // prevent infinite loop
709
710
    // [0] Space between tag and first attribute
711
    $space = [$this->copy_skip($this->token_blank), '', ''];
712
713
    do { // Parse attributes
714
      $name = $this->copy_until($this->token_equal);
715
716
      if ('' === $name && null !== $this->char && '' === $space[0]) {
717
        break;
718
      }
719
720
      if ($guard === $this->pos) { // Escape infinite loop
721
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
722
        continue;
723
      }
724
725
      $guard = $this->pos;
726
727
      if ($this->pos >= $this->size - 1 && '>' !== $this->char) { // End Of File
728
        Debug::log('Source document ended unexpectedly!');
729
        $node->nodetype = HtmlNode::HDOM_TYPE_TEXT;
730
        $node->_[HtmlNode::HDOM_INFO_END] = 0;
731
        $node->_[HtmlNode::HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
732
        $node->tag = 'text';
733
        $this->link_nodes($node, false);
734
735
        return true;
736
      }
737
738
      if ('/' === $name || '' === $name) { // No more attributes
739
        break;
740
      }
741
742
      // [1] Whitespace after attribute name
743
      $space[1] = (false === strpos($this->token_blank, $this->char)) ? '' : $this->copy_skip($this->token_blank);
744
745
      $name = $this->restore_noise($name); // might be a noisy name
746
747
      if ($this->lowercase) {
748
        $name = strtolower($name);
749
      }
750
751
      if ('=' === $this->char) { // Attribute with value
752
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
753
        $this->parse_attr($node, $name, $space, $trim); // get attribute value
754
      } else { // Attribute without value
755
        $node->_[HtmlNode::HDOM_INFO_QUOTE][$name] = HtmlNode::HDOM_QUOTE_NO;
756
        $node->attr[$name] = true;
757
        if ('>' !== $this->char) {
758
          $this->char = $this->doc[--$this->pos];
759
        } // prev
760
      }
761
762
      // Space before attribute and around equal sign
763
      if (!$trim && $space !== [' ', '', '']) {
764
        // phpcs:ignore Generic.Files.LineLength
765
        Debug::log_once('Source document contains superfluous whitespace in attributes (<e    attribute  =  "value">). Enable trimming or fix attribute spacing for best performance.');
766
        $node->_[HtmlNode::HDOM_INFO_SPACE][$name] = $space;
767
      }
768
769
      // prepare for next attribute
770
      $space = [
771
        ((false === strpos($this->token_blank, $this->char)) ? '' : $this->copy_skip($this->token_blank)),
772
        '',
773
        '',
774
      ];
775
    } while ('>' !== $this->char && '/' !== $this->char);
776
777
    $this->link_nodes($node, true);
778
779
    // Space after last attribute before closing the tag
780
    if (!$trim && '' !== $space[0]) {
781
      // phpcs:ignore Generic.Files.LineLength
782
      Debug::log_once('Source document contains superfluous whitespace before the closing braket (<e attribute="value"     >). Enable trimming or remove spaces before closing brackets for best performance.');
783
      $node->_[HtmlNode::HDOM_INFO_ENDSPACE] = $space[0];
784
    }
785
786
    $rest = ('>' === $this->char) ? '' : $this->copy_until_char('>');
787
    $rest = ($trim) ? trim($rest) : $rest; // <html   /   >
788
789
    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
790
791
    if ('/' === trim($rest)) { // Void element
792
      if ('' !== $rest) {
793
        if (isset($node->_[HtmlNode::HDOM_INFO_ENDSPACE])) {
794
          $node->_[HtmlNode::HDOM_INFO_ENDSPACE] .= $rest;
795
        } else {
796
          $node->_[HtmlNode::HDOM_INFO_ENDSPACE] = $rest;
797
        }
798
      }
799
      $node->_[HtmlNode::HDOM_INFO_END] = 0;
800
    } elseif (!isset($this->self_closing_tags[strtolower($node->tag)])) {
801
      $innertext = $this->copy_until_char('<');
802
      if ('' !== $innertext) {
803
        $node->_[HtmlNode::HDOM_INFO_INNER] = $innertext;
804
      }
805
      $this->parent = $node;
806
    }
807
808
    if ('br' === $node->tag) {
809
      $node->_[HtmlNode::HDOM_INFO_INNER] = $this->default_br_text;
810
    } elseif ('script' === $node->tag) {
811
      $data = '';
812
813
      // There is a rare chance of empty script: "<script></script>"
814
      // In which case the current char is the start of the end tag
815
      // But the script could also just contain tags: "<script><div></script>"
816
      while (true) {
817
        // Copy until first char of end tag
818
        $data .= $this->copy_until_char('<');
819
820
        // Look ahead in the document, maybe we are at the end
821
        if (($this->pos + 9) > $this->size) { // End of document
822
          Debug::log('Source document ended unexpectedly!');
823
          break;
824
        } elseif ('</script' === substr($this->doc, $this->pos, 8)) { // end
825
          $this->skip('>'); // don't include the end tag
826
          break;
827
        }
828
829
        // Note: A script tag may contain any other tag except </script>
830
        // which needs to be escaped as <\/script>
831
832
        $data .= $this->char;
833
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
834
      }
835
836
      $node = new HtmlNode($this);
837
      ++$this->cursor;
838
      $node->_[HtmlNode::HDOM_INFO_TEXT] = $data;
839
      $this->link_nodes($node, false);
840
    }
841
842
    return true;
843
  }
844
845
  /**
   * Read the value of attribute $name at the current parser position and
   * store it on $node.
   *
   * The value may be double-quoted, single-quoted or unquoted. When the node
   * already has an attribute of that name, the value is still consumed from
   * the document, but neither the stored attribute, the quote information nor
   * $space is modified (the first occurrence wins).
   *
   * @param HtmlNode $node  Node that receives the attribute.
   * @param string   $name  Attribute name.
   * @param array    $space Whitespace information of the attribute (by
   *                        reference); index 2 receives the whitespace between
   *                        "=" and the value.
   * @param bool     $trim  Whether to collapse whitespace runs inside the
   *                        value into single spaces and trim the result.
   *
   * @return void
   */
  protected function parse_attr($node, $name, &$space, $trim)
  {
    $is_duplicate = isset($node->attr[$name]);

    if (!$is_duplicate) { // Copy whitespace between "=" and value
      $space[2] = (false === strpos($this->token_blank, $this->char)) ? '' : $this->copy_skip($this->token_blank);
    }

    switch ($this->char) {
      case '"':
        $quote_type = HtmlNode::HDOM_QUOTE_DOUBLE;
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // skip opening quote
        $value = $this->copy_until_char('"');
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // skip closing quote
        break;
      case '\'':
        // phpcs:ignore Generic.Files.LineLength
        Debug::log_once('Source document contains attribute values with single quotes (<e attribute=\'value\'>). Use double quotes for best performance.');
        $quote_type = HtmlNode::HDOM_QUOTE_SINGLE;
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // skip opening quote
        $value = $this->copy_until_char('\'');
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // skip closing quote
        break;
      default:
        // phpcs:ignore Generic.Files.LineLength
        Debug::log_once('Source document contains attribute values without quotes (<e attribute=value>). Use double quotes for best performance');
        $quote_type = HtmlNode::HDOM_QUOTE_NO;
        $value = $this->copy_until($this->token_attr);
    }

    // Put back content that was replaced by a noise placeholder
    $value = $this->restore_noise($value);

    if ($trim) {
      // Attribute values must not contain control characters other than space
      // https://www.w3.org/TR/html/dom.html#text-content
      // https://www.w3.org/TR/html/syntax.html#attribute-values
      // https://www.w3.org/TR/xml/#AVNormalize
      $normalized = preg_replace("/[\r\n\t\s]+/u", ' ', $value);

      if (null === $normalized) {
        // With the "u" modifier preg_replace() returns null when the subject
        // is not valid UTF-8 (e.g. a document in a single-byte charset).
        // Fall back to byte-wise matching instead of losing the value.
        $normalized = preg_replace("/[\r\n\t\s]+/", ' ', $value);
      }

      $value = trim(null === $normalized ? $value : $normalized);
    }

    if (!$is_duplicate) {
      // Double quotes are the default; only deviations are recorded
      if (HtmlNode::HDOM_QUOTE_DOUBLE !== $quote_type) {
        $node->_[HtmlNode::HDOM_INFO_QUOTE][$name] = $quote_type;
      }
      $node->attr[$name] = $value;
    }
  }
893
894
  /**
   * Attach $node to the current parent node ($this->parent).
   *
   * The node is always appended to the parent's list of nodes and only
   * appended to the parent's list of children when $is_child is truthy.
   *
   * @param HtmlNode $node     Node to attach (passed by reference).
   * @param bool     $is_child Whether the node also counts as a child.
   *
   * @return void
   */
  protected function link_nodes(&$node, $is_child)
  {
    $node->parent = $this->parent;
    $this->parent->nodes[] = $node;

    if (!$is_child) {
      return;
    }

    $this->parent->children[] = $node;
  }
902
903
  /**
   * Add the closing tag "</$tag>" to the document as plain text.
   *
   * A new node holding the literal closing tag is linked to the current
   * parent (not as a child) and the parser advances by one character.
   *
   * @param string $tag Tag name to emit as text.
   *
   * @return bool Always true.
   */
  protected function as_text_node($tag)
  {
    $text_node = new HtmlNode($this);
    ++$this->cursor;

    $text_node->_[HtmlNode::HDOM_INFO_TEXT] = "</{$tag}>";
    $this->link_nodes($text_node, false);

    // Move on to the next character (null at the end of the document)
    if (++$this->pos < $this->size) {
      $this->char = $this->doc[$this->pos];
    } else {
      $this->char = null;
    }

    return true;
  }
912
913
  /**
   * Advance the parser past every consecutive character contained in $chars.
   *
   * Afterwards $this->char holds the character at the new position, or null
   * when the end of the document has been reached.
   *
   * @param string $chars Set of characters to skip.
   *
   * @return void
   */
  protected function skip($chars)
  {
    $skipped = strspn($this->doc, $chars, $this->pos);
    $this->pos += $skipped;

    if ($this->pos < $this->size) {
      $this->char = $this->doc[$this->pos];
    } else {
      $this->char = null;
    }
  }
918
919
  /**
   * Advance the parser past every consecutive character contained in $chars
   * and return the characters that were skipped.
   *
   * Parser state is left untouched when nothing can be skipped.
   *
   * @param string $chars Set of characters to skip.
   *
   * @return string The skipped characters, or '' if there were none.
   */
  protected function copy_skip($chars)
  {
    $start = $this->pos;
    $length = strspn($this->doc, $chars, $start);

    if (0 !== $length) {
      $this->pos += $length;

      // Character at the new position (null at the end of the document)
      if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
      } else {
        $this->char = null;
      }

      return substr($this->doc, $start, $length);
    }

    return '';
  }
930
931
  /**
   * Advance the parser up to the first character contained in $chars and
   * return everything in front of it.
   *
   * Afterwards $this->char holds the character at the new position, or null
   * when the end of the document has been reached.
   *
   * @param string $chars Set of characters that stop the copy.
   *
   * @return string The text between the old and the new position.
   */
  protected function copy_until($chars)
  {
    $start = $this->pos;
    $length = strcspn($this->doc, $chars, $start);

    $this->pos += $length;

    if ($this->pos < $this->size) {
      $this->char = $this->doc[$this->pos];
    } else {
      $this->char = null;
    }

    return substr($this->doc, $start, $length);
  }
939
940
  /**
   * Advance the parser up to the next occurrence of $char and return
   * everything in front of it.
   *
   * If $char does not occur again, the remainder of the document is returned
   * and the parser is moved to the end of the document ($this->char is null).
   *
   * @param string $char Text to search for.
   *
   * @return string The text in front of $char; '' when the parser is already
   *                at the end of the document or right at $char.
   */
  protected function copy_until_char($char)
  {
    // Already at the end of the document
    if (null === $this->char) {
      return '';
    }

    $start = $this->pos;
    $found = strpos($this->doc, $char, $start);

    if (false === $found) {
      // No further occurrence: consume the rest of the document
      $remainder = substr($this->doc, $start, $this->size - $start);
      $this->char = null;
      $this->pos = $this->size;

      return $remainder;
    }

    if ($found === $start) {
      // Parser is right at $char, there is nothing to copy
      return '';
    }

    $this->char = $this->doc[$found];
    $this->pos = $found;

    return substr($this->doc, $start, $found - $start);
  }
964
965
  /**
   * Replace every match of $pattern in the document by a placeholder key and
   * remember the replaced text in $this->noise under that key.
   *
   * Keys have the form "___noise___" followed by a number that is formatted
   * with sprintf('% 5d') from the current noise count plus 1000. Matches are
   * processed from last to first, so replacing one match does not shift the
   * captured offsets of the matches that are still to be processed.
   *
   * @param string $pattern    Regular expression describing the noise.
   * @param bool   $remove_tag If truthy the entire match is replaced,
   *                           otherwise only the first sub-match.
   *
   * @return void
   */
  protected function remove_noise($pattern, $remove_tag = false)
  {
    $count = preg_match_all(
      $pattern,
      $this->doc,
      $matches,
      PREG_SET_ORDER | PREG_OFFSET_CAPTURE
    );

    // 0 = entire match, 1 = submatch
    $group = $remove_tag ? 0 : 1;

    for ($i = $count - 1; $i >= 0; --$i) {
      $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

      $this->noise[$key] = $matches[$i][$group][0];
      $this->doc = substr_replace(
        $this->doc,
        $key,
        $matches[$i][$group][1],
        strlen($matches[$i][$group][0])
      );
    }

    // reset the length of content
    $this->size = strlen($this->doc);

    if (0 < $this->size) {
      $this->char = $this->doc[0];
    }
  }
989
990
  /**
   * Replace noise placeholders in $text by the content they stand for.
   *
   * A placeholder is the marker "___noise___" followed by five characters.
   * Every key that is restored is removed from $this->noise. Restored content
   * is scanned again, so placeholders inside of it are restored as well.
   *
   * Placeholders that cannot be resolved are replaced by a diagnostic
   * message:
   *  - "UNDEFINED NOISE FOR KEY: <key>" if the key is unknown
   *  - "NO NUMERIC NOISE KEY" if the text ends before the key is complete
   *
   * @param string $text Text that may contain noise placeholders.
   *
   * @return string The text with placeholders replaced.
   */
  public function restore_noise($text)
  {
    if (empty($this->noise)) {
      return $text; // nothing to restore
    }

    $marker = '___noise___';
    $marker_length = 11; // strlen($marker)
    $key_length = 16; // marker + five characters

    $pos = 0;

    while (false !== ($pos = strpos($text, $marker, $pos))) {
      // Sometimes there is a broken piece of markup, and we don't get the
      // five characters behind the marker, which indicates a problem outside
      // of us...
      if (strlen($text) > $pos + 15) {
        $key = substr($text, $pos, $key_length);

        if (isset($this->noise[$key])) {
          // $pos is not advanced, so the restored content is scanned too
          $text = substr($text, 0, $pos)
            . $this->noise[$key]
            . substr($text, $pos + $key_length);

          unset($this->noise[$key]);
        } else {
          Debug::log_once('Noise restoration failed. DOM has been corrupted!');

          $message = 'UNDEFINED NOISE FOR KEY: ';

          $text = substr($text, 0, $pos)
            . $message
            . $key
            . substr($text, $pos + $key_length);

          // The message contains the unknown key and therefore the marker
          // itself. Resume the search behind it; searching from the same
          // position again would find the same key forever (infinite loop).
          $pos += strlen($message) + $key_length;
        }
      } else {
        // There is no valid key being given back to us... We must get
        // rid of the ___noise___ or we will have a problem.
        Debug::log_once('Noise restoration failed. The provided key is incomplete: ' . $text);
        $text = substr($text, 0, $pos)
          . 'NO NUMERIC NOISE KEY'
          . substr($text, $pos + $marker_length);
      }
    }

    return $text;
  }
1038
1039
  /**
   * Find the first stored noise element that contains $text.
   *
   * @param string $text Text to look for.
   *
   * @return string|null The matching noise element, null if none matches.
   */
  public function search_noise($text)
  {
    foreach ($this->noise as $noiseElement) {
      if (false === strpos($noiseElement, $text)) {
        continue;
      }

      return $noiseElement;
    }

    return null;
  }
1047
1048
  /**
   * String representation of the document: the inner text of the root node.
   *
   * @return string
   */
  public function __toString()
  {
    $markup = $this->root->innertext();

    return $markup;
  }
1052
1053
  /**
   * Provide read access to virtual properties of the document.
   *
   * Supported names:
   *  - outertext, innertext: inner text of the root node
   *  - plaintext: text of the root node
   *  - charset: $this->_charset
   *  - target_charset: $this->_target_charset
   *
   * @param string $name Property name.
   *
   * @return mixed The property value, null for any other name.
   */
  public function __get($name)
  {
    if ('outertext' === $name || 'innertext' === $name) {
      return $this->root->innertext();
    }

    if ('plaintext' === $name) {
      return $this->root->text();
    }

    if ('charset' === $name) {
      return $this->_charset;
    }

    if ('target_charset' === $name) {
      return $this->_target_charset;
    }

    return null;
  }
1068
1069
  /**
   * Forward to childNodes() of the root node.
   *
   * @param int $idx Passed on unchanged to the root node (default -1).
   *
   * @return mixed Whatever the root node returns.
   */
  public function childNodes($idx = -1)
  {
    $root = $this->root;

    return $root->childNodes($idx);
  }
1073
1074
  /**
   * Forward to firstChild() of the root node.
   *
   * @return mixed Whatever the root node returns.
   */
  public function firstChild()
  {
    $root = $this->root;

    return $root->firstChild();
  }
1078
1079
  /**
   * Forward to lastChild() of the root node.
   *
   * @return mixed Whatever the root node returns.
   */
  public function lastChild()
  {
    $root = $this->root;

    return $root->lastChild();
  }
1083
1084
  /**
   * Create a new element node with the tag $name.
   *
   * The node is constructed without a document (null is passed to the
   * HtmlNode constructor).
   *
   * @param string      $name  Tag name of the element.
   * @param string|null $value Inner content of the element; nothing is set
   *                           when null.
   *
   * @return HtmlNode The new element.
   */
  public function createElement($name, $value = null)
  {
    $element = new HtmlNode(null);

    $element->nodetype = HtmlNode::HDOM_TYPE_ELEMENT;
    $element->_[HtmlNode::HDOM_INFO_BEGIN] = 1;
    $element->_[HtmlNode::HDOM_INFO_END] = 1;

    $has_content = (null !== $value);
    if ($has_content) {
      $element->_[HtmlNode::HDOM_INFO_INNER] = $value;
    }

    $element->tag = $name;

    return $element;
  }
1099
1100
  /**
   * Create a new text node for this document.
   *
   * @param string|null $value Text of the node; nothing is set when null.
   *
   * @return HtmlNode The new text node.
   */
  public function createTextNode($value)
  {
    $text_node = new HtmlNode($this);
    $text_node->nodetype = HtmlNode::HDOM_TYPE_TEXT;

    $has_text = (null !== $value);
    if ($has_text) {
      $text_node->_[HtmlNode::HDOM_INFO_TEXT] = $value;
    }

    return $text_node;
  }
1111
1112
  /**
   * Find the first result for the selector "#$id".
   *
   * @param string $id Element id.
   *
   * @return mixed Result of find() for index 0.
   */
  public function getElementById($id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
1116
1117
  /**
   * Run find() with the selector "#$id".
   *
   * @param string   $id  Element id.
   * @param int|null $idx Passed on unchanged to find() (default null).
   *
   * @return mixed Result of find().
   */
  public function getElementsById($id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
1121
1122
  /**
   * Find the first result for the selector $name.
   *
   * @param string $name Tag name, used as selector.
   *
   * @return mixed Result of find() for index 0.
   */
  public function getElementByTagName($name)
  {
    $first = $this->find($name, 0);

    return $first;
  }
1126
1127
  /**
   * Run find() with the selector $name.
   *
   * @param string   $name Tag name, used as selector.
   * @param int|null $idx  Passed on unchanged to find() (default null).
   *
   * @return mixed Result of find().
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $result = $this->find($name, $idx);

    return $result;
  }
1131
1132
  /**
   * Read a file and load its content into this document.
   *
   * All arguments given to this method are passed on to file_get_contents(),
   * so its optional parameters can be supplied as well.
   *
   * @param string $file Name of the file to read.
   *
   * @return false|null False if the file could not be read, otherwise
   *                    nothing is returned.
   */
  public function loadFile($file)
  {
    $doc = call_user_func_array('file_get_contents', func_get_args());

    if (false === $doc) {
      return false;
    }

    $this->load($doc, true);
  }
1142
}
1143