HtmlDocument::__construct()   A
last analyzed

Complexity

Conditions 5
Paths 6

Size

Total Lines 33
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
eloc 16
nc 6
nop 8
dl 0
loc 33
rs 9.4222
c 0
b 0
f 0

How to fix   Many Parameters   

Many Parameters

Methods with many parameters are not only hard to understand; their parameters also tend to become inconsistent when you need more or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
3
namespace simplehtmldom;
4
5
/**
6
 * Website: http://sourceforge.net/projects/simplehtmldom/
7
 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/).
8
 *
9
 * Licensed under The MIT License
10
 * See the LICENSE file in the project root for more information.
11
 *
12
 * Authors:
13
 *   S.C. Chen
14
 *   John Schlick
15
 *   Rus Carroll
16
 *   logmanoriginal
17
 *
18
 * Contributors:
19
 *   Yousuke Kumakura
20
 *   Vadim Voituk
21
 *   Antcs
22
 *
23
 * Version Rev. 2.0-RC2 (415)
24
 */
25
include_once __DIR__ . '/constants.php';
26
include_once __DIR__ . '/HtmlNode.php';
27
include_once __DIR__ . '/Debug.php';
28
29
/**
30
 * HTMLDocument class.
31
 */
32
class HtmlDocument
33
{
34
  /**
35
   * HtmlNode instance.
36
   *
37
   * @var HtmlNode
38
   */
39
  public $root = null;
40
  public $nodes = [];
41
  public $callback = null;
42
  public $lowercase = false;
43
  public $original_size;
44
  public $size;
45
46
  protected $pos;
47
  protected $doc;
48
  protected $char;
49
50
  protected $cursor;
51
  protected $parent;
52
  protected $noise = [];
53
  protected $token_blank = " \t\r\n";
54
  protected $token_equal = ' =/>';
55
  protected $token_slash = " />\r\n\t";
56
  protected $token_attr = ' >';
57
58
  public $_charset = '';
59
  public $_target_charset = '';
60
61
  public $default_br_text = '';
62
  public $default_span_text = '';
63
64
  protected $self_closing_tags = [
65
    'area' => 1,
66
    'base' => 1,
67
    'br' => 1,
68
    'col' => 1,
69
    'embed' => 1,
70
    'hr' => 1,
71
    'img' => 1,
72
    'input' => 1,
73
    'link' => 1,
74
    'meta' => 1,
75
    'param' => 1,
76
    'source' => 1,
77
    'track' => 1,
78
    'wbr' => 1,
79
  ];
80
  protected $block_tags = [
81
    'body' => 1,
82
    'div' => 1,
83
    'form' => 1,
84
    'root' => 1,
85
    'span' => 1,
86
    'table' => 1,
87
  ];
88
  protected $optional_closing_tags = [
89
    // Not optional, see
90
    // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
91
    'b' => ['b' => 1],
92
    'dd' => ['dd' => 1, 'dt' => 1],
93
    // Not optional, see
94
    // https://www.w3.org/TR/html/grouping-content.html#the-dl-element
95
    'dl' => ['dd' => 1, 'dt' => 1],
96
    'dt' => ['dd' => 1, 'dt' => 1],
97
    'li' => ['li' => 1],
98
    'optgroup' => ['optgroup' => 1, 'option' => 1],
99
    'option' => ['optgroup' => 1, 'option' => 1],
100
    'p' => ['p' => 1],
101
    'rp' => ['rp' => 1, 'rt' => 1],
102
    'rt' => ['rp' => 1, 'rt' => 1],
103
    'td' => ['td' => 1, 'th' => 1],
104
    'th' => ['td' => 1, 'th' => 1],
105
    'tr' => ['td' => 1, 'th' => 1, 'tr' => 1],
106
  ];
107
108
  /**
   * Route calls to legacy snake_case method names.
   *
   * - 'load_file' is forwarded to loadFile() after logging a deprecation
   *   message through Debug::log().
   * - 'clear' is accepted and ignored (no-op).
   * - Any other name raises an E_USER_ERROR.
   *
   * @param string $func Name of the inaccessible method that was called
   * @param array  $args Arguments passed to that method
   *
   * @return mixed Result of the forwarded call; null for the no-op and for
   *               unknown methods
   */
  public function __call($func, $args)
  {
    // Allow users to call methods with lower_case syntax
    switch ($func) {
      case 'load_file':
        $actual_function = 'loadFile';
        break;
      case 'clear':
        return null; /* no-op */
      default:
        trigger_error(
          'Call to undefined method ' . __CLASS__ . '::' . $func . '()',
          E_USER_ERROR
        );

        // A user-defined error handler may resume execution after the
        // error. Stop here instead of falling through with an undefined
        // target method name.
        return null;
    }

    // phpcs:ignore Generic.Files.LineLength
    Debug::log(__CLASS__ . '->' . $func . '() has been deprecated and will be removed in the next major version of simplehtmldom. Use ' . __CLASS__ . '->' . $actual_function . '() instead.');

    return call_user_func_array([$this, $actual_function], $args);
  }
129
130
  /**
   * Create a document and, when $str is non-empty, load it right away.
   *
   * A value that starts with http:// or https://, or that names an existing
   * file, is loaded through loadFile(); any other non-empty value is parsed
   * as markup through load(). An empty value only initialises an empty
   * document via prepare().
   *
   * @param string|null $str             Markup, file path or URL
   * @param bool        $lowercase       Forwarded to load()/prepare()
   * @param bool        $forceTagsClosed When false, the table of optional
   *                                     closing tags is emptied before parsing
   * @param string      $target_charset  Charset passed to html_entity_decode()
   *                                     while decoding
   * @param bool        $stripRN         Forwarded to load()
   * @param string      $defaultBRText   Forwarded to load()/prepare()
   * @param string      $defaultSpanText Forwarded to load()/prepare()
   * @param int         $options         Bit flags forwarded to load()
   */
  public function __construct(
    $str = null,
    $lowercase = true,
    $forceTagsClosed = true,
    $target_charset = DEFAULT_TARGET_CHARSET,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0
  ) {
    // Both settings are consumed while the document is parsed and decoded,
    // so they have to be in place before anything gets loaded.
    $this->_target_charset = $target_charset;

    // Forcing tags to be closed implies that we don't trust the html, but
    // it can lead to parsing errors if we SHOULD trust the html.
    if (!$forceTagsClosed) {
      $this->optional_closing_tags = [];
    }

    if ($str) {
      // Only ask the filesystem about strings that can be a path at all;
      // large markup strings would otherwise trigger warnings in is_file().
      $is_url = 1 === preg_match('/^https?:\/\//i', $str);
      $is_path = !$is_url && strlen($str) <= PHP_MAXPATHLEN && is_file($str);

      if ($is_url || $is_path) {
        // Call the real method directly instead of the deprecated
        // load_file alias handled by __call().
        $this->loadFile($str);
      } else {
        $this->load(
          $str,
          $lowercase,
          $stripRN,
          $defaultBRText,
          $defaultSpanText,
          $options
        );
      }
    } else {
      $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);
    }
  }
164
165
  /**
   * Provide a compact summary of the document for var_dump().
   *
   * @return array Root node, collected noise (or the string 'none'), source
   *               and target charset, and the original document size
   */
  public function __debugInfo()
  {
    $noise = $this->noise;

    if (empty($noise)) {
      $noise = 'none';
    }

    $summary = [];
    $summary['root'] = $this->root;
    $summary['noise'] = $noise;
    $summary['charset'] = $this->_charset;
    $summary['target charset'] = $this->_target_charset;
    $summary['original size'] = $this->original_size;

    return $summary;
  }
175
176
  /**
   * Call clear() on every node that is still registered with the document.
   */
  public function __destruct()
  {
    if (!isset($this->nodes)) {
      return;
    }

    foreach ($this->nodes as $node) {
      $node->clear();
    }
  }
184
185
  /**
   * Parse a markup string into this document.
   *
   * Steps: reset state via prepare(), optionally collapse whitespace and
   * newlines, set aside server-side (and optionally Smarty) scripts as
   * noise, parse, detect the charset and decode entities. The raw source
   * is released afterwards.
   *
   * @param string $str             Markup to parse
   * @param bool   $lowercase       Forwarded to prepare()
   * @param bool   $stripRN         Collapse whitespace/newlines before parsing
   * @param string $defaultBRText   Forwarded to prepare()
   * @param string $defaultSpanText Forwarded to prepare()
   * @param int    $options         Bit flags; HDOM_SMARTY_AS_TEXT sets Smarty
   *                                scripts aside as noise
   *
   * @return $this
   */
  public function load(
    $str,
    $lowercase = true,
    $stripRN = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT,
    $options = 0
  ) {
    $this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

    if ($stripRN) {
      // Elements whose whitespace must survive are set aside first
      $whitespace_sensitive = [
        "'<\s*script[^>]*>(.*?)<\s*/\s*script\s*>'is",
        "'<!\[CDATA\[(.*?)\]\]>'is",
        "'<!--(.*?)-->'is",
        "'<\s*style[^>]*>(.*?)<\s*/\s*style\s*>'is",
        "'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is",
      ];

      foreach ($whitespace_sensitive as $pattern) {
        $this->remove_noise($pattern);
      }

      // First drop line breaks between tags, then fold the remaining line
      // breaks inside text into a single space. Order matters.
      $whitespace_rules = [
        '/\>([\t\s]*[\r\n]^[\t\s]*)\</m' => '><',
        '/([\t\s]*[\r\n]^[\t\s]*)/m' => ' ',
      ];

      foreach ($whitespace_rules as $pattern => $replacement) {
        $this->doc = preg_replace($pattern, $replacement, $this->doc);
      }

      // Put the protected elements back and account for the new length
      $this->doc = $this->restore_noise($this->doc);
      $this->size = strlen($this->doc);
    }

    // Server-side script
    $this->remove_noise("'(<\?)(.*?)(\?>)'s", true);

    if (count($this->noise) > 0) {
      // phpcs:ignore Generic.Files.LineLength
      Debug::log('Support for server-side scripts has been deprecated and will be removed in the next major version of simplehtmldom.');
    }

    // Smarty scripts
    if ($options & HDOM_SMARTY_AS_TEXT) {
      $this->remove_noise("'(\{\w)(.*?)(\})'s", true);
      // phpcs:ignore Generic.Files.LineLength
      Debug::log('Support for Smarty scripts has been deprecated and will be removed in the next major version of simplehtmldom.');
    }

    $this->parse($stripRN);
    $this->root->_[HtmlNode::HDOM_INFO_END] = $this->cursor;
    $this->parse_charset();
    $this->decode();

    // The raw source is not needed once the tree has been built
    unset($this->doc);

    // Returning the document keeps load() chainable
    return $this;
  }
238
239
  /**
   * Store a callback on the document.
   *
   * @param callable $function_name Value assigned to the public $callback
   *                                property
   */
  public function set_callback($function_name)
  {
    $this->callback = $function_name;
  }
243
244
  /**
   * Reset the public $callback property to null.
   */
  public function remove_callback()
  {
    $this->callback = null;
  }
248
249
  /**
   * Serialise the document and optionally write it to a file.
   *
   * @param string $filepath Target file; the empty string skips writing
   *
   * @return string Inner text of the root node
   */
  public function save($filepath = '')
  {
    $markup = $this->root->innertext();

    if ('' === $filepath) {
      return $markup;
    }

    // Exclusive lock while writing
    file_put_contents($filepath, $markup, LOCK_EX);

    return $markup;
  }
263
264
  /**
   * Find elements by CSS selector, starting at the root node.
   *
   * All arguments are handed to HtmlNode::find() unchanged.
   *
   * @param string   $selector  CSS selector
   * @param int|null $idx       Index of the match to return
   * @param bool     $lowercase Forwarded to HtmlNode::find()
   *
   * @return HtmlNode[]|HtmlNode
   */
  public function find($selector, $idx = null, $lowercase = false)
  {
    $root = $this->root;

    return $root->find($selector, $idx, $lowercase);
  }
277
278
  /**
   * Get the inner text of the first <title> element.
   *
   * @return string|null Null when find() yields no title element
   */
  public function title()
  {
    $title_node = $this->find('title', 0);

    if (!$title_node) {
      return null;
    }

    return $title_node->innertext;
  }
283
284
  /**
   * Delegate to HtmlNode::expect() on the root node.
   *
   * @param string   $selector  CSS selector
   * @param int|null $idx       Forwarded unchanged
   * @param bool     $lowercase Forwarded unchanged
   *
   * @return mixed Whatever the root node's expect() returns
   */
  public function expect($selector, $idx = null, $lowercase = false)
  {
    $root = $this->root;

    return $root->expect($selector, $idx, $lowercase);
  }
288
289
  /**
   * Delegate to HtmlNode::dump() on the root node.
   *
   * @param bool $show_attr Forwarded unchanged
   *
   * @codeCoverageIgnore
   */
  public function dump($show_attr = true)
  {
    $root = $this->root;
    $root->dump($show_attr);
  }
294
295
  /**
   * Reset the parser state and install a fresh root node.
   *
   * @param string|null $str             Source markup; null is treated as an
   *                                     empty document
   * @param bool        $lowercase       Stored in $this->lowercase
   * @param string      $defaultBRText   Stored in $this->default_br_text
   * @param string      $defaultSpanText Stored in $this->default_span_text
   */
  protected function prepare(
    $str,
    $lowercase = true,
    $defaultBRText = DEFAULT_BR_TEXT,
    $defaultSpanText = DEFAULT_SPAN_TEXT
  ) {
    // Resolved by __call(), where 'clear' is a no-op
    $this->clear();

    // The constructor passes null for an empty document; trim() no longer
    // accepts null without a deprecation notice (PHP 8.1+).
    $this->doc = trim((string) $str);
    $this->size = strlen($this->doc);
    $this->original_size = $this->size; // original size of the html
    $this->pos = 0;
    $this->cursor = 1;
    $this->noise = [];
    $this->nodes = [];
    $this->lowercase = $lowercase;
    $this->default_br_text = $defaultBRText;
    $this->default_span_text = $defaultSpanText;

    $this->root = new HtmlNode($this);
    $this->root->tag = 'root';
    $this->root->_[HtmlNode::HDOM_INFO_BEGIN] = -1;
    $this->root->nodetype = HtmlNode::HDOM_TYPE_ROOT;
    $this->parent = $this->root;

    // $this->char stays untouched for an empty document
    if ($this->size > 0) {
      $this->char = $this->doc[0];
    }
  }
322
323
  /**
   * Decode HTML entities in all registered nodes.
   *
   * Text and inner text have their noise restored before decoding.
   * Attribute values are decoded as they are; attributes stored as boolean
   * true (no value) are left untouched.
   */
  protected function decode()
  {
    $flags = ENT_QUOTES | ENT_HTML5;
    $noisy_slots = [HtmlNode::HDOM_INFO_TEXT, HtmlNode::HDOM_INFO_INNER];

    foreach ($this->nodes as $node) {
      foreach ($noisy_slots as $slot) {
        if (!isset($node->_[$slot])) {
          continue;
        }

        $node->_[$slot] = html_entity_decode(
          $this->restore_noise($node->_[$slot]),
          $flags,
          $this->_target_charset
        );
      }

      if (!isset($node->attr) || !is_array($node->attr)) {
        continue;
      }

      foreach ($node->attr as $attr_name => $attr_value) {
        if (true === $attr_value) {
          continue;
        }

        $node->attr[$attr_name] = html_entity_decode(
          $attr_value,
          $flags,
          $this->_target_charset
        );
      }
    }
  }
354
355
  /**
   * Main parser loop: alternate between text runs and tags until read_tag()
   * reports that nothing is left.
   *
   * @param bool $trim When true, text runs consisting only of whitespace are
   *                   dropped; the flag is also forwarded to read_tag()
   */
  protected function parse($trim = false)
  {
    for (;;) {
      if ('<' !== $this->char) {
        $text = $this->copy_until_char('<');

        if ('' !== $text) {
          $is_blank = '' === trim($text);

          // Skip whitespace between tags? (</a> <b>)
          if ($trim && $is_blank) {
            continue;
          }

          $text_node = new HtmlNode($this);
          ++$this->cursor;
          $text_node->_[HtmlNode::HDOM_INFO_TEXT] = $text;
          $this->link_nodes($text_node, false);
        }
      }

      $has_more = $this->read_tag($trim);

      if (false === $has_more) {
        return;
      }
    }
  }
379
380
  /**
   * Determine the charset of the source document and store it in
   * $this->_charset.
   *
   * Sources are tried in this order, the first non-empty result wins:
   *  1. the Content-Type header exposed by the optional user function
   *     get_last_retrieve_url_contents_content_type()
   *  2. <meta http-equiv="Content-Type" content="...charset=...">
   *  3. <meta charset="...">
   *  4. mb_detect_encoding() on the raw document (if mbstring is available)
   *  5. fallback to UTF-8
   *
   * ISO-8859-1 and its aliases are reported as CP1252.
   *
   * @return string The detected charset
   */
  protected function parse_charset()
  {
    $charset = null;

    if (function_exists('get_last_retrieve_url_contents_content_type')) {
      $contentTypeHeader = call_user_func('get_last_retrieve_url_contents_content_type');

      // Parameter names in media types are case-insensitive, so match
      // "Charset=" as well; this also agrees with the <meta> lookup below.
      // The cast guards against a null header.
      $success = preg_match('/charset=(.+)/i', (string) $contentTypeHeader, $matches);
      if ($success) {
        $charset = $matches[1];
      }

      // phpcs:ignore Generic.Files.LineLength
      Debug::log('Determining charset using get_last_retrieve_url_contents_content_type() ' . ($success ? 'successful' : 'failed'));
    }

    if (empty($charset)) {
      // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
      $el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

      if (!empty($el)) {
        $fullvalue = $el->content;

        if (!empty($fullvalue)) {
          $success = preg_match(
            '/charset=(.+)/i',
            $fullvalue,
            $matches
          );

          if ($success) {
            $charset = $matches[1];
          }
        }
      }
    }

    if (empty($charset)) {
      // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
      if ($meta = $this->root->find('meta[charset]', 0)) {
        $charset = $meta->charset;
      }
    }

    if (empty($charset)) {
      // Try to guess the charset based on the content
      // Requires Multibyte String (mbstring) support (optional)
      if (function_exists('mb_detect_encoding')) {
        /**
         * mb_detect_encoding() is not intended to distinguish between
         * charsets, especially single-byte charsets. Its primary
         * purpose is to detect which multibyte encoding is in use,
         * i.e. UTF-8, UTF-16, shift-JIS, etc.
         *
         * -- https://bugs.php.net/bug.php?id=38138
         *
         * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
         * always result in CP1251/ISO-8859-5 and vice versa.
         *
         * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
         * to stay compatible.
         */
        $encoding = mb_detect_encoding(
          $this->doc,
          ['UTF-8', 'CP1252', 'ISO-8859-1']
        );

        if ('CP1252' === $encoding || 'ISO-8859-1' === $encoding) {
          // Due to a limitation of mb_detect_encoding
          // 'CP1251'/'ISO-8859-5' will be detected as
          // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
          // which case we can simply assume it is the other charset.
          if (!@iconv('CP1252', 'UTF-8', $this->doc)) {
            $encoding = 'CP1251';
          }
        }

        if (false !== $encoding) {
          $charset = $encoding;
        }
      }
    }

    if (empty($charset)) {
      Debug::log('Unable to determine charset from source document. Assuming UTF-8');
      $charset = 'UTF-8';
    }

    // Since CP1252 is a superset, if we get one of its subsets, we want
    // it instead.
    $charset_lower = strtolower($charset);

    if (in_array($charset_lower, ['iso-8859-1', 'latin1', 'latin-1'], true)) {
      $charset = 'CP1252';
    }

    return $this->_charset = $charset;
  }
478
479
  /**
   * Read one tag at the current position and update the node tree.
   *
   * Handles, in this order: end of document, end tags, declarations
   * (comments, CDATA, unknown "<!..."), invalid tag names (kept as text)
   * and regular start tags including their attributes. The content of a
   * script element is copied verbatim into a text node.
   *
   * @param bool $trim When true, whitespace inside the tag is skipped
   *                   instead of being recorded on the node
   *
   * @return bool False once the end of the document has been reached, true
   *              otherwise
   */
  protected function read_tag($trim)
  {
    if ('<' !== $this->char) { // End Of File
      $this->root->_[HtmlNode::HDOM_INFO_END] = $this->cursor;

      // We might be in a nest of unclosed elements for which the end tags
      // can be omitted. Close them for faster seek operations.
      do {
        if (isset($this->optional_closing_tags[strtolower($this->parent->tag)])) {
          $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;
        }
      } while ($this->parent = $this->parent->parent);

      return false;
    }

    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    if ($trim) { // "<   /html>"
      $this->skip($this->token_blank);
    }

    // End tag: https://dev.w3.org/html5/pf-summary/syntax.html#end-tags
    if ('/' === $this->char) {
      $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

      $tag = $this->copy_until_char('>');
      $tag = $trim ? ltrim($tag, $this->token_blank) : $tag;

      // Skip attributes and whitespace in end tags
      if ($trim && false !== ($pos = strpos($tag, ' '))) {
        // phpcs:ignore Generic.Files.LineLength
        Debug::log_once('Source document contains superfluous whitespace in end tags (</html   >).');
        $tag = substr($tag, 0, $pos);
      }

      if (strcasecmp($this->parent->tag, $tag)) { // Parent is not start tag
        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);
        if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
          $org_parent = $this->parent;

          // Look for the start tag
          while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
            // Close any unclosed element with optional end tags
            if (isset($this->optional_closing_tags[strtolower($this->parent->tag)])) {
              $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;
            }
            $this->parent = $this->parent->parent;
          }

          // No start tag, close grandparent
          if (strtolower($this->parent->tag) !== $tag_lower) {
            $this->parent = $org_parent;

            if ($this->parent->parent) {
              $this->parent = $this->parent->parent;
            }

            $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;

            return $this->as_text_node($tag);
          }
        } elseif (($this->parent->parent) && isset($this->block_tags[$tag_lower])) {
          // grandparent exists + current is block tag
          // Parent has no end tag
          $this->parent->_[HtmlNode::HDOM_INFO_END] = 0;
          $org_parent = $this->parent;

          // Find start tag
          while (($this->parent->parent) && strtolower($this->parent->tag) !== $tag_lower) {
            $this->parent = $this->parent->parent;
          }

          // No start tag, close parent
          if (strtolower($this->parent->tag) !== $tag_lower) {
            $this->parent = $org_parent; // restore original parent
            $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;

            return $this->as_text_node($tag);
          }
        } elseif (($this->parent->parent) && strtolower($this->parent->parent->tag) === $tag_lower) {
          // Grandparent exists and current tag closes it
          $this->parent->_[HtmlNode::HDOM_INFO_END] = 0;
          $this->parent = $this->parent->parent;
        } else { // Random tag, add as text node
          return $this->as_text_node($tag);
        }
      }

      // Link with start tag
      $this->parent->_[HtmlNode::HDOM_INFO_END] = $this->cursor;

      if ($this->parent->parent) {
        $this->parent = $this->parent->parent;
      }

      $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
      return true;
    }

    // Start tag: https://dev.w3.org/html5/pf-summary/syntax.html#start-tags
    $node = new HtmlNode($this);
    $node->_[HtmlNode::HDOM_INFO_BEGIN] = $this->cursor++;

    // Tag name
    $tag = $this->copy_until($this->token_slash);

    if (isset($tag[0]) && '!' === $tag[0]) { // Doctype, CData, Comment
      if (isset($tag[2]) && '-' === $tag[1] && '-' === $tag[2]) { // Comment ("<!--")
        // Go back until $tag only contains start of comment "!--".
        while (strlen($tag) > 3) {
          $this->char = $this->doc[--$this->pos]; // previous
          $tag = substr($tag, 0, strlen($tag) - 1);
        }

        $node->nodetype = HtmlNode::HDOM_TYPE_COMMENT;
        $node->tag = 'comment';

        $data = '';

        // There is a rare chance of empty comment: "<!---->"
        // In which case the current char is the first "-" of the end tag
        // But the comment could also just be a dash: "<!----->"
        while (true) {
          // Copy until first char of end tag
          $data .= $this->copy_until_char('-');

          // Look ahead in the document, maybe we are at the end
          if (($this->pos + 3) > $this->size) { // End of document
            Debug::log('Source document ended unexpectedly!');
            break;
          } elseif ('-->' === substr($this->doc, $this->pos, 3)) { // end
            $data .= $this->copy_until_char('>');
            break;
          }

          $data .= $this->char;
          $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        }

        $tag .= $data;
        $tag = $this->restore_noise($tag);

        // Comment starts after "!--" and ends before "--" (5 chars total)
        $node->_[HtmlNode::HDOM_INFO_INNER] = substr($tag, 3, strlen($tag) - 5);
      } elseif ('[CDATA[' === substr($tag, 1, 7)) {
        // Go back until $tag only contains start of cdata "![CDATA[".
        while (strlen($tag) > 8) {
          $this->char = $this->doc[--$this->pos]; // previous
          $tag = substr($tag, 0, strlen($tag) - 1);
        }

        // CDATA can contain HTML stuff, need to find closing tags first
        $node->nodetype = HtmlNode::HDOM_TYPE_CDATA;
        $node->tag = 'cdata';

        $data = '';

        // There is a rare chance of empty CDATA: "<![CDATA[]]>"
        // In which case the current char is the first "]" of the end tag
        // But the CDATA could also just be a bracket: "<![CDATA[]]]>"
        while (true) {
          // Copy until first char of end tag
          $data .= $this->copy_until_char(']');

          // Look ahead in the document, maybe we are at the end
          if (($this->pos + 3) > $this->size) { // End of document
            Debug::log('Source document ended unexpectedly!');
            break;
          } elseif (']]>' === substr($this->doc, $this->pos, 3)) { // end
            $data .= $this->copy_until_char('>');
            break;
          }

          $data .= $this->char;
          $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        }

        $tag .= $data;
        $tag = $this->restore_noise($tag);

        // CDATA starts after "![CDATA[" and ends before "]]" (10 chars total)
        $node->_[HtmlNode::HDOM_INFO_INNER] = substr($tag, 8, strlen($tag) - 10);
      } else { // Unknown
        Debug::log('Source document contains unknown declaration: <' . $tag);
        $node->nodetype = HtmlNode::HDOM_TYPE_UNKNOWN;
        $node->tag = 'unknown';
      }

      $node->_[HtmlNode::HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

      if ('>' === $this->char) {
        $node->_[HtmlNode::HDOM_INFO_TEXT] .= '>';
      }

      $this->link_nodes($node, true);
      $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
      return true;
    }

    if (!preg_match('/^\w[\w:-]*$/', $tag)) { // Invalid tag name
      $node->_[HtmlNode::HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');

      if ('>' === $this->char) { // End tag
        $node->_[HtmlNode::HDOM_INFO_TEXT] .= '>';
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
      }

      $this->link_nodes($node, false);
      Debug::log('Source document contains invalid tag name: ' . $node->_[HtmlNode::HDOM_INFO_TEXT]);

      return true;
    }

    // Valid tag name
    $node->nodetype = HtmlNode::HDOM_TYPE_ELEMENT;
    $tag_lower = strtolower($tag);
    $node->tag = ($this->lowercase) ? $tag_lower : $tag;

    if (isset($this->optional_closing_tags[$tag_lower])) { // Optional closing tag
      while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
        // Previous element was the last element of ancestor
        $this->parent->_[HtmlNode::HDOM_INFO_END] = $node->_[HtmlNode::HDOM_INFO_BEGIN] - 1;
        $this->parent = $this->parent->parent;
      }
      $node->parent = $this->parent;
    }

    $guard = 0; // prevent infinity loop

    // [0] Space between tag and first attribute
    $space = [$this->copy_skip($this->token_blank), '', ''];

    do { // Parse attributes
      $name = $this->copy_until($this->token_equal);

      if ('' === $name && null !== $this->char && '' === $space[0]) {
        break;
      }

      if ($guard === $this->pos) { // Escape infinite loop
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        continue;
      }

      $guard = $this->pos;

      if ($this->pos >= $this->size - 1 && '>' !== $this->char) { // End Of File
        Debug::log('Source document ended unexpectedly!');
        $node->nodetype = HtmlNode::HDOM_TYPE_TEXT;
        $node->_[HtmlNode::HDOM_INFO_END] = 0;
        $node->_[HtmlNode::HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
        $node->tag = 'text';
        $this->link_nodes($node, false);

        return true;
      }

      if ('/' === $name || '' === $name) { // No more attributes
        break;
      }

      // [1] Whitespace after attribute name
      // (the current char is null at the end of the document; strpos() must
      // not receive a null needle)
      $space[1] = (null === $this->char || false === strpos($this->token_blank, $this->char))
        ? ''
        : $this->copy_skip($this->token_blank);

      $name = $this->restore_noise($name); // might be a noisy name

      if ($this->lowercase) {
        $name = strtolower($name);
      }

      if ('=' === $this->char) { // Attribute with value
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
        $this->parse_attr($node, $name, $space, $trim); // get attribute value
      } else { // Attribute without value
        $node->_[HtmlNode::HDOM_INFO_QUOTE][$name] = HtmlNode::HDOM_QUOTE_NO;
        $node->attr[$name] = true;
        if ('>' !== $this->char) {
          $this->char = $this->doc[--$this->pos];
        } // prev
      }

      // Space before attribute and around equal sign
      if (!$trim && $space !== [' ', '', '']) {
        // phpcs:ignore Generic.Files.LineLength
        Debug::log_once('Source document contains superfluous whitespace in attributes (<e    attribute  =  "value">). Enable trimming or fix attribute spacing for best performance.');
        $node->_[HtmlNode::HDOM_INFO_SPACE][$name] = $space;
      }

      // prepare for next attribute
      // (same null guard as above: parse_attr() may have consumed the last
      // character of the document)
      $space = [
        ((null === $this->char || false === strpos($this->token_blank, $this->char))
          ? ''
          : $this->copy_skip($this->token_blank)),
        '',
        '',
      ];
    } while ('>' !== $this->char && '/' !== $this->char);

    $this->link_nodes($node, true);

    // Space after last attribute before closing the tag
    if (!$trim && '' !== $space[0]) {
      // phpcs:ignore Generic.Files.LineLength
      Debug::log_once('Source document contains superfluous whitespace before the closing braket (<e attribute="value"     >). Enable trimming or remove spaces before closing brackets for best performance.');
      $node->_[HtmlNode::HDOM_INFO_ENDSPACE] = $space[0];
    }

    $rest = ('>' === $this->char) ? '' : $this->copy_until_char('>');
    $rest = ($trim) ? trim($rest) : $rest; // <html   /   >

    $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next

    if ('/' === trim($rest)) { // Void element
      if ('' !== $rest) {
        if (isset($node->_[HtmlNode::HDOM_INFO_ENDSPACE])) {
          $node->_[HtmlNode::HDOM_INFO_ENDSPACE] .= $rest;
        } else {
          $node->_[HtmlNode::HDOM_INFO_ENDSPACE] = $rest;
        }
      }
      $node->_[HtmlNode::HDOM_INFO_END] = 0;
    } elseif (!isset($this->self_closing_tags[strtolower($node->tag)])) {
      $innertext = $this->copy_until_char('<');
      if ('' !== $innertext) {
        $node->_[HtmlNode::HDOM_INFO_INNER] = $innertext;
      }
      $this->parent = $node;
    }

    // Compare against the lowercased name: $node->tag keeps the source
    // casing when $this->lowercase is disabled, and tag names are
    // case-insensitive everywhere else in this parser.
    if ('br' === $tag_lower) {
      $node->_[HtmlNode::HDOM_INFO_INNER] = $this->default_br_text;
    } elseif ('script' === $tag_lower) {
      $data = '';

      // There is a rare chance of empty script: "<script></script>"
      // In which case the current char is the start of the end tag
      // But the script could also just contain tags: "<script><div></script>"
      while (true) {
        // Copy until first char of end tag
        $data .= $this->copy_until_char('<');

        // Look ahead in the document, maybe we are at the end
        if (($this->pos + 9) > $this->size) { // End of document
          Debug::log('Source document ended unexpectedly!');
          break;
        } elseif (0 === strcasecmp('</script', substr($this->doc, $this->pos, 8))) { // end
          // The end tag may be written in any case ("</SCRIPT>")
          $this->skip('>'); // don't include the end tag
          break;
        }

        // Note: A script tag may contain any other tag except </script>
        // which needs to be escaped as <\/script>

        $data .= $this->char;
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
      }

      $node = new HtmlNode($this);
      ++$this->cursor;
      $node->_[HtmlNode::HDOM_INFO_TEXT] = $data;
      $this->link_nodes($node, false);
    }

    return true;
  }
844
845
  /**
   * Read one attribute value at the current position and store it on a node.
   *
   * The value is always consumed from the document so the cursor advances,
   * but it is only recorded when the node does not already have an attribute
   * called $name (the first occurrence wins).
   *
   * @param HtmlNode $node  Node that receives the attribute.
   * @param string   $name  Attribute name.
   * @param array    $space Spacing information, passed by reference. Index 2
   *                        receives the whitespace between "=" and the value.
   * @param bool     $trim  Whether to collapse whitespace runs inside the
   *                        value to single spaces and trim the result.
   *
   * @return void
   */
  protected function parse_attr($node, $name, &$space, $trim)
  {
    $is_duplicate = isset($node->attr[$name]);

    if (!$is_duplicate) { // Copy whitespace between "=" and value
      // The current char is null once the end of the document is reached;
      // there is nothing left to copy then and strpos() must not see null.
      $space[2] = (null === $this->char || false === strpos($this->token_blank, $this->char))
        ? ''
        : $this->copy_skip($this->token_blank);
    }

    switch ($this->char) {
      case '"':
        $quote_type = HtmlNode::HDOM_QUOTE_DOUBLE;
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // skip opening quote
        $value = $this->copy_until_char('"');
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // skip closing quote
        break;
      case '\'':
        // phpcs:ignore Generic.Files.LineLength
        Debug::log_once('Source document contains attribute values with single quotes (<e attribute=\'value\'>). Use double quotes for best performance.');
        $quote_type = HtmlNode::HDOM_QUOTE_SINGLE;
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // skip opening quote
        $value = $this->copy_until_char('\'');
        $this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // skip closing quote
        break;
      default:
        // phpcs:ignore Generic.Files.LineLength
        Debug::log_once('Source document contains attribute values without quotes (<e attribute=value>). Use double quotes for best performance');
        $quote_type = HtmlNode::HDOM_QUOTE_NO;
        // Unquoted values end at the first character of $token_attr
        $value = $this->copy_until($this->token_attr);
    }

    $value = $this->restore_noise($value);

    if ($trim) {
      // Attribute values must not contain control characters other than space
      // https://www.w3.org/TR/html/dom.html#text-content
      // https://www.w3.org/TR/html/syntax.html#attribute-values
      // https://www.w3.org/TR/xml/#AVNormalize
      $normalized = preg_replace("/[\r\n\t\s]+/u", ' ', $value);

      if (null === $normalized) {
        // preg_replace() returns null on error, which the /u modifier
        // triggers for subjects that are not valid UTF-8. Retry byte-wise
        // with the same ASCII whitespace set instead of losing the value.
        $normalized = preg_replace('/[ \t\n\r\f\x0B]+/', ' ', $value);
      }

      $value = trim(null === $normalized ? $value : $normalized);
    }

    if (!$is_duplicate) {
      // Only non-default quoting is recorded
      if (HtmlNode::HDOM_QUOTE_DOUBLE !== $quote_type) {
        $node->_[HtmlNode::HDOM_INFO_QUOTE][$name] = $quote_type;
      }
      $node->attr[$name] = $value;
    }
  }
893
894
  /**
   * Attach a node to the current parent node.
   *
   * The node's parent is set to the current parent and the node is appended
   * to that parent's node list. When $is_child is truthy the node is appended
   * to the parent's children list as well.
   *
   * @param HtmlNode $node     Node to attach.
   * @param bool     $is_child Whether to also register the node as a child.
   *
   * @return void
   */
  protected function link_nodes(&$node, $is_child)
  {
    $node->parent = $this->parent;
    $this->parent->nodes[] = $node;

    if (!$is_child) {
      return;
    }

    $this->parent->children[] = $node;
  }
902
903
  /**
   * Record a closing tag as plain text.
   *
   * Creates a new node whose text is "</$tag>", links it to the current
   * parent (not as a child) and advances the cursor by one character.
   *
   * @param string $tag Tag name to render inside the closing tag text.
   *
   * @return bool Always true.
   */
  protected function as_text_node($tag)
  {
    $text_node = new HtmlNode($this);
    ++$this->cursor;
    $text_node->_[HtmlNode::HDOM_INFO_TEXT] = "</{$tag}>";
    $this->link_nodes($text_node, false);

    // Step to the next character; null marks the end of the document
    ++$this->pos;
    if ($this->pos < $this->size) {
      $this->char = $this->doc[$this->pos];
    } else {
      $this->char = null;
    }

    return true;
  }
912
913
  /**
   * Advance past the leading run of characters contained in $chars.
   *
   * Updates the position and the current character; the current character
   * becomes null when the position reaches the end of the document.
   *
   * @param string $chars Set of characters to skip.
   *
   * @return void
   */
  protected function skip($chars)
  {
    $run = strspn($this->doc, $chars, $this->pos);
    $this->pos += $run;

    if ($this->pos < $this->size) {
      $this->char = $this->doc[$this->pos];
    } else {
      $this->char = null;
    }
  }
918
919
  /**
   * Advance past the leading run of characters contained in $chars and
   * return that run.
   *
   * When the run is empty nothing is changed and an empty string is returned.
   *
   * @param string $chars Set of characters to consume.
   *
   * @return string The consumed characters.
   */
  protected function copy_skip($chars)
  {
    $start = $this->pos;
    $run = strspn($this->doc, $chars, $start);

    if (0 !== $run) {
      $this->pos += $run;

      if ($this->pos < $this->size) {
        $this->char = $this->doc[$this->pos];
      } else {
        $this->char = null; // end of document
      }

      return substr($this->doc, $start, $run);
    }

    return '';
  }
930
931
  /**
   * Consume characters up to (not including) the first character that is
   * contained in $chars and return them.
   *
   * Updates the position and the current character; the current character
   * becomes null when the position reaches the end of the document.
   *
   * @param string $chars Set of characters that stop the copy.
   *
   * @return string The consumed characters.
   */
  protected function copy_until($chars)
  {
    $start = $this->pos;
    $run = strcspn($this->doc, $chars, $start);
    $this->pos += $run;

    if ($this->pos < $this->size) {
      $this->char = $this->doc[$this->pos];
    } else {
      $this->char = null;
    }

    return substr($this->doc, $start, $run);
  }
939
940
  /**
   * Consume characters up to (not including) the next occurrence of $char
   * and return them.
   *
   * - Returns '' without moving when the end of the document was already
   *   reached (current character is null) or when $char is found at the
   *   current position.
   * - When $char does not occur any more, the rest of the document is
   *   returned, the position is set to the document size and the current
   *   character to null.
   *
   * @param string $char String to search for.
   *
   * @return string The consumed characters.
   */
  protected function copy_until_char($char)
  {
    if (null === $this->char) {
      return '';
    }

    $start = $this->pos;
    $hit = strpos($this->doc, $char, $start);

    if (false === $hit) {
      // Not found: hand out everything that is left
      $this->char = null;
      $this->pos = $this->size;

      return substr($this->doc, $start, $this->size - $start);
    }

    if ($hit !== $start) {
      $this->char = $this->doc[$hit];
      $this->pos = $hit;

      return substr($this->doc, $start, $hit - $start);
    }

    return '';
  }
964
965
  /**
   * Replace every match of $pattern in the document with a placeholder key
   * and remember the replaced text in the noise table.
   *
   * Each key is "___noise___" followed by a number that is space-padded to
   * five characters (noise count + 1000). Matches are processed from last to
   * first so that the recorded offsets of earlier matches stay valid.
   * Afterwards the document size and the current character are refreshed.
   *
   * @param string $pattern    Regular expression to search for.
   * @param bool   $remove_tag When truthy the entire match is replaced,
   *                           otherwise only the first sub-match.
   *
   * @return void
   */
  protected function remove_noise($pattern, $remove_tag = false)
  {
    $count = preg_match_all(
      $pattern,
      $this->doc,
      $matches,
      PREG_SET_ORDER | PREG_OFFSET_CAPTURE
    );

    // 0 = entire match, 1 = submatch
    $group = $remove_tag ? 0 : 1;

    $i = $count - 1;
    while ($i > -1) {
      $key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

      $hit = $matches[$i][$group]; // [matched text, byte offset]
      $this->noise[$key] = $hit[0];
      $this->doc = substr_replace($this->doc, $key, $hit[1], strlen($hit[0]));

      --$i;
    }

    // reset the length of content
    $this->size = strlen($this->doc);

    if ($this->size > 0) {
      $this->char = $this->doc[0];
    }
  }
989
990
  /**
   * Replace noise placeholders in $text with the text they stand for.
   *
   * A placeholder is the token "___noise___" followed by five characters;
   * together they form the 16-character key into the noise table. A restored
   * entry is removed from the table.
   *
   * Placeholders that cannot be resolved are replaced with a diagnostic
   * message: "UNDEFINED NOISE FOR KEY: <key>" when the key is unknown, or
   * "NO NUMERIC NOISE KEY" when fewer than five characters follow the token.
   *
   * @param string $text Text that may contain noise placeholders.
   *
   * @return string Text with placeholders replaced.
   */
  public function restore_noise($text)
  {
    if (empty($this->noise)) {
      return $text; // nothing to restore
    }

    $pos = 0;
    while (false !== ($pos = strpos($text, '___noise___', $pos))) {
      if (strlen($text) > $pos + 15) {
        // Token (11 chars) plus the five characters that follow it
        $key = substr($text, $pos, 16);

        if (isset($this->noise[$key])) {
          // $pos is not advanced, so placeholders inside the restored text
          // are picked up by the next iteration as well.
          $text = substr($text, 0, $pos)
            . $this->noise[$key]
            . substr($text, $pos + 16);

          unset($this->noise[$key]);
        } else {
          Debug::log_once('Noise restoration failed. DOM has been corrupted!');

          $message = 'UNDEFINED NOISE FOR KEY: ' . $key;
          $text = substr($text, 0, $pos)
            . $message
            . substr($text, $pos + 16);

          // The message contains the key and therefore the "___noise___"
          // token itself. Continue searching behind it, otherwise the same
          // token would be found again on every iteration (infinite loop).
          $pos += strlen($message);
        }
      } else {
        // There is no valid key being given back to us... We must get
        // rid of the ___noise___ or we will have a problem.
        Debug::log_once('Noise restoration failed. The provided key is incomplete: ' . $text);
        $text = substr($text, 0, $pos)
          . 'NO NUMERIC NOISE KEY'
          . substr($text, $pos + 11);
      }
    }

    return $text;
  }
1038
1039
  /**
   * Find the first noise entry that contains $text.
   *
   * @param string $text Text to look for inside the stored noise entries.
   *
   * @return string|null The matching noise entry, or null when none matches.
   */
  public function search_noise($text)
  {
    foreach ($this->noise as $candidate) {
      if (false === strpos($candidate, $text)) {
        continue;
      }

      return $candidate;
    }

    return null;
  }
1047
1048
  /**
   * String representation of the document: the inner text of the root node.
   *
   * @return string
   */
  public function __toString()
  {
    $markup = $this->root->innertext();

    return $markup;
  }
1052
1053
  /**
   * Magic getter for virtual read-only properties.
   *
   * Supported names:
   * - outertext, innertext: inner text of the root node
   * - plaintext: text of the root node
   * - charset: value of $_charset
   * - target_charset: value of $_target_charset
   *
   * @param string $name Property name.
   *
   * @return mixed The property value, or null for any other name.
   */
  public function __get($name)
  {
    switch ($name) {
      case 'outertext':
      case 'innertext':
        // Both names resolve to the root node's inner text
        $result = $this->root->innertext();
        break;
      case 'plaintext':
        $result = $this->root->text();
        break;
      case 'charset':
        $result = $this->_charset;
        break;
      case 'target_charset':
        $result = $this->_target_charset;
        break;
      default:
        $result = null;
    }

    return $result;
  }
1068
1069
  /**
   * Delegate to the root node's childNodes().
   *
   * @param int $idx Index forwarded unchanged to the root node (default -1).
   *
   * @return mixed Whatever the root node's childNodes() returns.
   */
  public function childNodes($idx = -1)
  {
    $root = $this->root;

    return $root->childNodes($idx);
  }
1073
1074
  /**
   * Delegate to the root node's firstChild().
   *
   * @return mixed Whatever the root node's firstChild() returns.
   */
  public function firstChild()
  {
    $root = $this->root;

    return $root->firstChild();
  }
1078
1079
  /**
   * Delegate to the root node's lastChild().
   *
   * @return mixed Whatever the root node's lastChild() returns.
   */
  public function lastChild()
  {
    $root = $this->root;

    return $root->lastChild();
  }
1083
1084
  /**
   * Create a new element node with the given tag name.
   *
   * The node is constructed without a document (null is passed to the
   * HtmlNode constructor) and is not linked to any parent.
   *
   * @param string      $name  Tag name of the new element.
   * @param string|null $value Inner content; only stored when not null.
   *
   * @return HtmlNode The new element node.
   */
  public function createElement($name, $value = null)
  {
    $element = new HtmlNode(null);
    $element->nodetype = HtmlNode::HDOM_TYPE_ELEMENT;
    $element->tag = $name;

    $element->_[HtmlNode::HDOM_INFO_BEGIN] = 1;
    $element->_[HtmlNode::HDOM_INFO_END] = 1;

    if (null === $value) {
      return $element;
    }

    $element->_[HtmlNode::HDOM_INFO_INNER] = $value;

    return $element;
  }
1099
1100
  /**
   * Create a new text node for this document.
   *
   * The node is constructed with this document but is not linked to any
   * parent.
   *
   * @param string|null $value Text content; only stored when not null.
   *
   * @return HtmlNode The new text node.
   */
  public function createTextNode($value)
  {
    $text_node = new HtmlNode($this);
    $text_node->nodetype = HtmlNode::HDOM_TYPE_TEXT;

    if (null === $value) {
      return $text_node;
    }

    $text_node->_[HtmlNode::HDOM_INFO_TEXT] = $value;

    return $text_node;
  }
1111
1112
  /**
   * Find the first element matching the selector "#$id".
   *
   * @param string $id Element id.
   *
   * @return mixed Result of find() for the selector with index 0.
   */
  public function getElementById($id)
  {
    $selector = '#' . $id;

    return $this->find($selector, 0);
  }
1116
1117
  /**
   * Find elements matching the selector "#$id".
   *
   * @param string   $id  Element id.
   * @param int|null $idx Index forwarded unchanged to find() (default null).
   *
   * @return mixed Result of find() for the selector and index.
   */
  public function getElementsById($id, $idx = null)
  {
    $selector = '#' . $id;

    return $this->find($selector, $idx);
  }
1121
1122
  /**
   * Find the first element matching the selector $name.
   *
   * @param string $name Tag name, used as the selector as-is.
   *
   * @return mixed Result of find() for the selector with index 0.
   */
  public function getElementByTagName($name)
  {
    $first = $this->find($name, 0);

    return $first;
  }
1126
1127
  /**
   * Find elements matching the selector $name.
   *
   * @param string   $name Tag name, used as the selector as-is.
   * @param int|null $idx  Index forwarded unchanged to find() (default null).
   *
   * @return mixed Result of find() for the selector and index.
   */
  public function getElementsByTagName($name, $idx = null)
  {
    $found = $this->find($name, $idx);

    return $found;
  }
1131
1132
  /**
   * Read a file with file_get_contents() and load its contents.
   *
   * All arguments given to this method, including any beyond $file, are
   * forwarded to file_get_contents() in the same order.
   *
   * @param string $file File name passed to file_get_contents().
   *
   * @return false|null False when the file could not be read; nothing
   *                    (null) after the contents were passed to load().
   */
  public function loadFile($file)
  {
    $contents = call_user_func_array('file_get_contents', func_get_args());

    if (false === $contents) {
      return false;
    }

    $this->load($contents, true);
  }
1142
}
1143