Completed
Pull Request — master (#154)
by Christophe
01:50
created

Tokenizer::__construct()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 1

Importance

Changes 0
Metric Value
dl 0
loc 6
ccs 5
cts 5
cp 1
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 3
crap 1
1
<?php
2
namespace Masterminds\HTML5\Parser;
3
4
use Masterminds\HTML5\Elements;
5
6
/**
7
 * The HTML5 tokenizer.
8
 *
9
 * The tokenizer's role is reading data from the scanner and gathering it into
10
 * semantic units. From the tokenizer, data is emitted to an event handler,
11
 * which may (for example) create a DOM tree.
12
 *
13
 * The HTML5 specification has a detailed explanation of tokenizing HTML5. We
14
 * follow that specification to the maximum extent that we can. If you find
15
 * a discrepancy that is not documented, please file a bug and/or submit a
16
 * patch.
17
 *
18
 * This tokenizer is implemented as a recursive descent parser.
19
 *
20
 * Within the API documentation, you may see references to the specific section
21
 * of the HTML5 spec that the code attempts to reproduce. Example: 8.2.4.1.
22
 * This refers to section 8.2.4.1 of the HTML5 CR specification.
23
 *
24
 * @see http://www.w3.org/TR/2012/CR-html5-20121217/
25
 */
26
class Tokenizer
27
{
28
29
    protected $scanner;
30
31
    protected $events;
32
33
    protected $tok;
34
35
    /**
36
     * Buffer for text.
37
     */
38
    protected $text = '';
39
40
    // When this goes to false, the parser stops.
41
    protected $carryOn = true;
42
43
    protected $textMode = 0; // TEXTMODE_NORMAL;
44
    protected $untilTag = null;
45
46
    const CONFORMANT_XML = 'xml';
47
    const CONFORMANT_HTML = 'html';
48
    protected $mode = self::CONFORMANT_HTML;
49
50
    /**
     * Build a tokenizer around an input scanner and an output event handler.
     *
     * A typical parse creates a tokenizer with a scanner (the input) and an
     * event handler (the output), and then calls Tokenizer::parse().
     *
     * @param \Masterminds\HTML5\Parser\Scanner $scanner
     *            A scanner initialized with an input stream.
     * @param \Masterminds\HTML5\Parser\EventHandler $eventHandler
     *            An event handler, initialized and ready to receive events.
     * @param string $mode
     *            self::CONFORMANT_HTML (default) or self::CONFORMANT_XML. In
     *            XML mode tag names are not lowercased by endTag()/tagName().
     */
    public function __construct($scanner, $eventHandler, $mode = self::CONFORMANT_HTML)
    {
        // The three assignments are independent of one another.
        $this->mode = $mode;
        $this->events = $eventHandler;
        $this->scanner = $scanner;
    }
70
71
    /**
     * Run the tokenizer over the whole input.
     *
     * Repeatedly calls consumeData(), which emits tokens and errors into the
     * event handler, until the $carryOn flag has been cleared (eof() clears
     * it once the scanner reports the end of the input).
     */
    public function parse()
    {
        // FIXME: Add infinite loop protection.
        while (true) {
            $this->consumeData();

            if (!$this->carryOn) {
                break;
            }
        }
    }
89
90
    /**
     * Select how the character-data reader treats upcoming text.
     *
     * Three modes are distinguished:
     * - Normal (0): read until a tag is encountered.
     * - RCDATA: read until the closing tag named by $untilTag, decoding
     *   character references along the way.
     * - Raw: read verbatim until the closing tag named by $untilTag.
     *
     * The event handler normally selects the mode through the integer it
     * returns from startTag(); this method lets it be set manually as well.
     *
     * @param integer $textmode
     *            One of Elements::TEXT_*. Bits other than TEXT_RAW and
     *            TEXT_RCDATA are masked off, so any other value means normal.
     * @param string $untilTag
     *            The tag that should stop RAW or RCDATA mode. Normal mode does
     *            not use this indicator.
     */
    public function setTextMode($textmode, $untilTag = null)
    {
        $supportedModes = Elements::TEXT_RAW | Elements::TEXT_RCDATA;

        $this->untilTag = $untilTag;
        $this->textMode = $textmode & $supportedModes;
    }
115
116
    /**
     * Consume one unit of input and dispatch it.
     * HTML5 8.2.4.1
     *
     * Order of work: decode a leading character reference, try to parse a
     * tag if the scanner sits on '<', emit EOF when the input is exhausted,
     * and otherwise treat the current character as text according to the
     * active text mode.
     *
     * @return bool The current value of $carryOn (false once EOF was emitted).
     */
    protected function consumeData()
    {
        $current = $this->scanner->current();

        // Character reference
        if ('&' === $current) {
            $this->buffer($this->decodeCharacterReference());
            $current = $this->scanner->current();
        }

        // Parse tag
        if ('<' === $current) {
            // Any buffered text data can go out now.
            $this->flushBuffer();

            $afterBracket = $this->scanner->next();

            // Try each tag-like construct in turn and stop at the first one
            // that reports success. parseError() is documented in this file
            // as always returning false, so characterData() is the final
            // fallback after the error has been reported.
            if (!$this->markupDeclaration($afterBracket)
                && !$this->endTag()
                && !$this->processingInstruction()
                && !$this->tagName()
                && !$this->parseError("Illegal tag opening")
            ) {
                $this->characterData();
            }

            $current = $this->scanner->current();
        }

        // Handle end of document
        $this->eof($current);

        // Parse character
        if (false !== $current) {
            if ($this->textMode == Elements::TEXT_RAW) {
                $this->rawText($current);
            } elseif ($this->textMode == Elements::TEXT_RCDATA) {
                $this->rcdata($current);
            } elseif ('<' !== $current && '&' !== $current) {
                // NULL character
                if ("\00" === $current) {
                    $this->parseError("Received null character.");
                }

                $this->text .= $current;
                $this->scanner->next();
            }
        }

        return $this->carryOn;
    }
179
180
    /**
     * Handle the current character as character data.
     *
     * The active text mode decides which reader is used: rawText() for
     * Elements::TEXT_RAW, rcdata() for Elements::TEXT_RCDATA, and text()
     * otherwise.
     *
     * @return bool False at EOF, or in normal mode when the character is
     *              '<' or '&'; otherwise the result of the chosen reader.
     */
    protected function characterData()
    {
        $tok = $this->scanner->current();
        if (false === $tok) {
            return false;
        }

        if ($this->textMode == Elements::TEXT_RAW) {
            return $this->rawText($tok);
        }

        if ($this->textMode == Elements::TEXT_RCDATA) {
            return $this->rcdata($tok);
        }

        // Normal mode: markup and reference starts are not plain text.
        $isPlainText = ('<' !== $tok && '&' !== $tok);

        return $isPlainText ? $this->text($tok) : false;
    }
205
206
    /**
     * Buffer the current token as character data and advance the scanner.
     *
     * A NUL character is reported as a parse error but is still buffered.
     *
     * @param string $tok The current token.
     *
     * @return bool False if $tok is false (EOF), true otherwise.
     */
    protected function text($tok)
    {
        // This should never happen...
        if (false === $tok) {
            return false;
        }

        // NULL character
        if ("\00" === $tok) {
            $this->parseError("Received null character.");
        }

        $this->buffer($tok);
        $this->scanner->next();

        return true;
    }
230
231
    /**
     * Read text in RAW mode.
     *
     * Without a terminating tag ($untilTag is null) the token is handled as
     * ordinary text. Otherwise everything up to "</untilTag>" is emitted as
     * a single text event, the text mode is reset to normal, and the end tag
     * itself is parsed.
     *
     * @param string $tok The current token.
     *
     * @return bool
     */
    protected function rawText($tok)
    {
        if (null === $this->untilTag) {
            return $this->text($tok);
        }

        $rawContent = $this->readUntilSequence('</' . $this->untilTag . '>');
        $this->events->text($rawContent);
        $this->setTextMode(0);

        return $this->endTag();
    }
251
252
    /**
     * Read text in RCDATA mode.
     *
     * Without a terminating tag ($untilTag is null) the token is handled as
     * ordinary text. Otherwise characters are collected, with character
     * references decoded, until "</untilTag" is seen or the input ends. The
     * collected text is emitted as one text event, the text mode is reset,
     * and the end tag is parsed.
     *
     * @param string $tok The current token.
     *
     * @return bool
     */
    protected function rcdata($tok)
    {
        if (null === $this->untilTag) {
            return $this->text($tok);
        }

        $closer = '</' . $this->untilTag;
        $collected = '';

        // The closing tag is matched case-sensitively unless it names a
        // known HTML5 element.
        $caseSensitive = !Elements::isHtml5Element($this->untilTag);

        while (false !== $tok) {
            if ('<' == $tok && $this->scanner->sequenceMatches($closer, $caseSensitive)) {
                break;
            }

            if ('&' == $tok) {
                $collected .= $this->decodeCharacterReference();
                $tok = $this->scanner->current();
            } else {
                $collected .= $tok;
                $tok = $this->scanner->next();
            }
        }

        // Look past "</name" and any whitespace to check for the closing
        // '>', then rewind by the same amount so endTag() sees the whole tag.
        $lookahead = strlen($closer);
        $this->scanner->consume($lookahead);
        $lookahead += strlen($this->scanner->whitespace());
        if ('>' !== $this->scanner->current()) {
            $this->parseError("Unclosed RCDATA end tag");
        }
        $this->scanner->unconsume($lookahead);

        $this->events->text($collected);
        $this->setTextMode(0);

        return $this->endTag();
    }
291
292
    /**
     * Emit an EOF event if the given token marks the end of the document.
     *
     * On EOF the text buffer is flushed, the handler's eof() is called, and
     * $carryOn is cleared so that parse() stops.
     *
     * @param string|false $tok The current token; false means end of input.
     *
     * @return bool True if EOF was emitted, false otherwise.
     */
    protected function eof($tok)
    {
        if (false !== $tok) {
            return false;
        }

        // fprintf(STDOUT, "EOF");
        $this->flushBuffer();
        $this->events->eof();
        $this->carryOn = false;

        return true;
    }
308
309
    /**
     * Parse a markup declaration: a comment, a DOCTYPE or a CDATA section.
     *
     * Anything else that starts with "<!" is reported as a parse error and
     * consumed as a bogus comment.
     *
     * @param string|false $tok The character following '<'.
     *
     * @return bool False if $tok is not '!'; otherwise the result of the
     *              selected sub-parser (true for the bogus-comment fallback).
     */
    protected function markupDeclaration($tok)
    {
        if ('!' != $tok) {
            return false;
        }

        $tok = $this->scanner->next();

        // Comment:
        if ('-' == $tok && '-' == $this->scanner->peek()) {
            $this->scanner->next(); // Consume the other '-'
            $this->scanner->next(); // Next char.

            return $this->comment();
        }

        // Doctype
        if ('D' == $tok || 'd' == $tok) {
            return $this->doctype();
        }

        // CDATA section
        if ('[' == $tok) {
            return $this->cdataSection();
        }

        // FINISH
        $this->parseError("Expected <!--, <![CDATA[, or <!DOCTYPE. Got <!%s", $tok);
        $this->bogusComment('<!');

        return true;
    }
340
341
    /**
     * Consume an end tag.
     * 8.2.4.9
     *
     * Expects the scanner to sit on the '/' of "</name>". The tag name is
     * lowercased unless the tokenizer is in CONFORMANT_XML mode.
     *
     * @return bool False if the current character is not '/', or if the
     *              character after it is NUL or EOF; otherwise true (or the
     *              result of bogusComment() for a non-alphabetic name start).
     */
    protected function endTag()
    {
        if ('/' != $this->scanner->current()) {
            return false;
        }

        $tok = $this->scanner->next();

        // a-zA-Z -> tagname
        // > -> parse error
        // EOF -> parse error
        // -> parse error
        if (!ctype_alpha($tok)) {
            $this->parseError("Expected tag name, got '%s'", $tok);

            $nothingToConsume = ("\0" == $tok || false === $tok);

            return $nothingToConsume ? false : $this->bogusComment('</');
        }

        $name = $this->scanner->charsUntil("\n\f \t>");
        if (self::CONFORMANT_XML !== $this->mode) {
            $name = strtolower($name);
        }

        // Trash whitespace.
        $this->scanner->whitespace();

        $tok = $this->scanner->current();
        if ('>' != $tok) {
            $this->parseError("Expected >, got '%s'", $tok);
            // We just trash stuff until we get to the next tag close.
            $this->scanner->charsUntil('>');
        }

        $this->events->endTag($name);
        $this->scanner->next();

        return true;
    }
380
381
    /**
     * Consume a start tag: its name and its attributes.
     * 8.2.4.10
     *
     * The name is lowercased unless the tokenizer is in CONFORMANT_XML mode.
     * If the handler's startTag() returns an integer, it is used as the new
     * text mode with this tag's name as the terminating tag.
     *
     * @return bool False if the current character is not alphabetic,
     *              true once the tag has been emitted.
     */
    protected function tagName()
    {
        if (!ctype_alpha($this->scanner->current())) {
            return false;
        }

        // We know this is at least one char.
        $name = $this->scanner->charsWhile(":_-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
        if (self::CONFORMANT_XML !== $this->mode) {
            $name = strtolower($name);
        }

        $attributes = array();
        $selfClose = false;

        // Handle attribute parse exceptions here so that we can
        // react by trying to build a sensible parse tree.
        try {
            while (true) {
                $this->scanner->whitespace();
                $this->attribute($attributes);

                if ($this->isTagEnd($selfClose)) {
                    break;
                }
            }
        } catch (ParseError $e) {
            $selfClose = false;
        }

        $requestedMode = $this->events->startTag($name, $attributes, $selfClose);
        if (is_int($requestedMode)) {
            $this->setTextMode($requestedMode, $name);
        }

        $this->scanner->next();

        return true;
    }
419
420
    /**
     * Check if the scanner has reached the end of a tag.
     *
     * A '/' followed (after optional whitespace) by '>' marks a self-closing
     * tag. EOF is reported as a parse error and also ends the tag.
     *
     * @param bool $selfClose Set to true (by reference) when "/>" is found.
     *
     * @return bool True if the tag ends here ('>' or EOF), false otherwise.
     */
    protected function isTagEnd(&$selfClose)
    {
        $tok = $this->scanner->current();

        $sawSlash = ('/' == $tok);
        if ($sawSlash) {
            $this->scanner->next();
            $this->scanner->whitespace();
            $tok = $this->scanner->current();
        }

        if ('>' == $tok) {
            if ($sawSlash) {
                $selfClose = true;
            }

            return true;
        }

        if (false === $tok) {
            $this->parseError("Unexpected EOF inside of tag.");

            return true;
        }

        if ($sawSlash) {
            // Basically, we skip the / token and go on.
            // See 8.2.4.43.
            $this->parseError("Unexpected '%s' inside of a tag.", $tok);
        }

        return false;
    }
455
456
    /**
     * Parse one attribute from inside of a tag.
     *
     * The attribute name is lowercased. Names containing characters that
     * DOMElement::setAttribute rejects are reported as parse errors and left
     * out of $attributes, but their value is still consumed.
     *
     * @param string[] $attributes
     *            Attribute map, extended by reference with name => value.
     *
     * @return bool False if the scanner is at '/', '>' or EOF (no attribute
     *              to read), true after an attribute has been consumed.
     *
     * @throws ParseError If a '<' is found where an attribute was expected.
     */
    protected function attribute(&$attributes)
    {
        $tok = $this->scanner->current();
        if ('/' == $tok || '>' == $tok || false === $tok) {
            return false;
        }

        if ('<' == $tok) {
            $this->parseError("Unexpected '<' inside of attributes list.");
            // Push the < back onto the stack.
            $this->scanner->unconsume();
            // Let the caller figure out how to handle this.
            throw new ParseError("Start tag inside of attribute.");
        }

        $name = strtolower($this->scanner->charsUntil("/>=\n\f\t "));

        if (0 == strlen($name)) {
            $tok = $this->scanner->current();
            $this->parseError("Expected an attribute name, got %s.", $tok);
            // Really, only '=' can be the char here. Everything else gets absorbed
            // under one rule or another.
            $name = $tok;
            $this->scanner->next();
        }

        // Attribute names can contain most Unicode characters for HTML5, and
        // there is no limitation on the first character. But method
        // "DOMElement::setAttribute" throws an exception because of its own
        // internal restrictions, so such names have to be filtered.
        // see issue #23: https://github.com/Masterminds/html5-php/issues/23
        // and http://www.w3.org/TR/2011/WD-html5-20110525/syntax.html#syntax-attribute-name
        $isValidAttribute = true;
        if (preg_match("/[\x1-\x2C\\/\x3B-\x40\x5B-\x5E\x60\x7B-\x7F]/u", $name)) {
            $this->parseError("Unexpected characters in attribute name: %s", $name);
            $isValidAttribute = false;
        } elseif (preg_match("/^[0-9.-]/u", $name)) {
            $this->parseError("Unexpected character at the begining of attribute name: %s", $name);
            $isValidAttribute = false;
        }

        // 8.1.2.3
        $this->scanner->whitespace();

        // The value is consumed even for rejected names so that the scanner
        // moves past it.
        $value = $this->attributeValue();
        if ($isValidAttribute) {
            $attributes[$name] = $value;
        }

        return true;
    }
519
520
    /**
     * Consume an attribute value.
     * 8.2.4.37 and after.
     *
     * Returns null when there is no '=' at the current position, when
     * whitespace follows the '=', or when the tag ends right after the '='
     * (the last case is also a parse error).
     *
     * At EOF the current token is false. It matches none of the comparisons
     * below, so it never reaches quotedAttributeValue(); it falls through to
     * unquotedAttributeValue(), which stops immediately on false.
     *
     * @return string|null
     */
    protected function attributeValue()
    {
        if ('=' != $this->scanner->current()) {
            return null;
        }
        $this->scanner->next();

        // 8.1.2.3
        $this->scanner->whitespace();

        $tok = $this->scanner->current();

        // Whitespace here indicates an empty value.
        if ("\n" == $tok || "\f" == $tok || " " == $tok || "\t" == $tok) {
            return null;
        }

        if ('"' == $tok || "'" == $tok) {
            $this->scanner->next();

            return $this->quotedAttributeValue($tok);
        }

        // '/' is deliberately not handled here:
        // 8.2.4.37 seems to allow foo=/ as a valid attr.
        if ('>' == $tok) {
            $this->parseError("Expected attribute value, got tag end.");

            return null;
        }

        if ('=' == $tok || '`' == $tok) {
            $this->parseError("Expecting quotes, got %s.", $tok);
        }

        return $this->unquotedAttributeValue();
    }
559
560
    /**
     * Read a quoted attribute value, decoding character references.
     *
     * Expects the scanner to sit just after the opening quote. After the
     * value has been read the scanner is advanced once more, past the
     * character that stopped it.
     *
     * @param string $quote
     *            IMPORTANT: This is a series of chars! Any one of which will be
     *            considered termination of an attribute's value. E.g. "\"'"
     *            will stop at either ' or ". A form feed always terminates
     *            the value as well.
     *
     * @return string The attribute value.
     */
    protected function quotedAttributeValue($quote)
    {
        // '&' interrupts the scan so the reference can be decoded in place.
        $stopChars = "\f" . $quote . '&';
        $value = '';

        do {
            $chunk = $this->scanner->charsUntil($stopChars);
            if (false === $chunk) {
                break;
            }
            $value .= $chunk;

            $atReference = ('&' == $this->scanner->current());
            if ($atReference) {
                $value .= $this->decodeCharacterReference(true);
            }
        } while ($atReference);

        $this->scanner->next();

        return $value;
    }
592
593 1
    /**
     * Read an unquoted attribute value, decoding character references.
     *
     * Reading stops at tab, newline, form feed, space, '>' or EOF; the
     * stopping character is not consumed. The characters " ' < = ` are
     * reported as parse errors but are still included in the value.
     *
     * @return string The attribute value (empty if reading stops at once).
     */
    protected function unquotedAttributeValue()
    {
        $terminators = array("\t", "\n", "\f", ' ', '>', false);
        $suspicious = array('"', '\'', '<', '=', '`');

        $value = '';
        $tok = $this->scanner->current();

        while (!in_array($tok, $terminators, true)) {
            if ('&' == $tok) {
                $value .= $this->decodeCharacterReference(true);
                $tok = $this->scanner->current();
                continue;
            }

            if (in_array($tok, $suspicious, true)) {
                $this->parseError("Unexpected chars in unquoted attribute value %s", $tok);
            }
            $value .= $tok;
            $tok = $this->scanner->next();
        }

        return $value;
    }
611
612
    /**
     * Consume malformed markup as if it were a comment.
     * 8.2.4.44
     *
     * The spec requires that the ENTIRE tag-like thing be enclosed inside of
     * the comment. So this will generate comments like:
     *
     * &lt;!--&lt/+foo&gt;--&gt;
     *
     * @param string $leading
     *            Prepend any leading characters. This essentially
     *            negates the need to backtrack, but it's sort of
     *            a hack.
     *
     * @return bool Always true.
     */
    protected function bogusComment($leading = '')
    {
        // Everything up to (not including) the next '>'.
        $body = $this->scanner->charsUntil('>');
        $comment = $leading . (false !== $body ? $body : '');

        // The character that stopped the scan, unless the input ran out.
        $stopChar = $this->scanner->current();
        if (false !== $stopChar) {
            $comment .= $stopChar;
        }

        $this->flushBuffer();
        $this->events->comment($comment);
        $this->scanner->next();

        return true;
    }
646
647
    /**
     * Read a comment.
     *
     * Expects the first tok to be inside of the comment, i.e. just after the
     * opening "<!--". Characters are collected until isCommentEnd() reports
     * the end (the closing sequence or EOF), then one comment event is
     * emitted.
     *
     * @return bool Always true.
     */
    protected function comment()
    {
        $tok = $this->scanner->current();

        // <!-->. Emit an empty comment because 8.2.4.46 says to.
        if ('>' == $tok) {
            // Parse error. Emit the comment token.
            $this->parseError("Expected comment data, got '>'");
            $this->events->comment('');
            $this->scanner->next();

            return true;
        }

        // Replace NULL with the replacement char.
        if ("\0" == $tok) {
            $tok = UTF8Utils::FFFD;
        }

        // isCommentEnd() must run before each append: it inspects the
        // scanner and may consume part of the closing sequence itself.
        for ($comment = ''; !$this->isCommentEnd(); $tok = $this->scanner->next()) {
            $comment .= $tok;
        }

        $this->events->comment($comment);
        $this->scanner->next();

        return true;
    }
681
682
    /**
     * Check if the scanner has reached the end of a comment.
     *
     * EOF counts as the end and is reported as a parse error. When the
     * closing "-->" is found, the scanner is left on its final '>'.
     *
     * @return bool
     */
    protected function isCommentEnd()
    {
        $tok = $this->scanner->current();

        // EOF
        if (false === $tok) {
            // Hit the end.
            $this->parseError("Unexpected EOF in a comment.");

            return true;
        }

        // If it doesn't start with -, not the end.
        if ('-' != $tok) {
            return false;
        }

        // Advance one, and test for '->'
        $second = $this->scanner->next();
        $closes = ('-' == $second) && ('>' == $this->scanner->peek());

        if (!$closes) {
            // Unread '-';
            $this->scanner->unconsume(1);

            return false;
        }

        $this->scanner->next(); // Consume the last '>'

        return true;
    }
712
713
    /**
     * Parse a DOCTYPE.
     *
     * Parse a DOCTYPE declaration. This method has strong bearing on whether or
     * not Quirksmode is enabled on the event handler: the fourth argument passed
     * to EventHandler::doctype() is true on every malformed-DOCTYPE path below.
     *
     * @todo This method is a little long. Should probably refactor.
     *
     * @return bool False if the scanner is not on a 'D'/'d', or if PUBLIC/SYSTEM
     *              is not followed by a quoted identifier; true otherwise (or
     *              the result of bogusComment()/eof() on those paths).
     */
    protected function doctype()
    {
        if (strcasecmp($this->scanner->current(), 'D')) {
            return false;
        }

        // Check that string is DOCTYPE.
        $chars = $this->scanner->charsWhile("DOCTYPEdoctype");
        if (strcasecmp($chars, 'DOCTYPE')) {
            $this->parseError('Expected DOCTYPE, got %s', $chars);

            return $this->bogusComment('<!' . $chars);
        }

        $this->scanner->whitespace();
        $tok = $this->scanner->current();

        // EOF: die.
        if ($tok === false) {
            $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true);

            return $this->eof($tok);
        }

        // NULL char: convert.
        if ($tok === "\0") {
            $this->parseError("Unexpected null character in DOCTYPE.");
        }

        $stop = " \n\f>";
        $doctypeName = $this->scanner->charsUntil($stop);

        // Lowercase ASCII, replace \0 with FFFD.
        // str_replace() is used instead of three-argument strtr(): strtr()
        // maps byte-for-byte and ignores the extra bytes of the longer
        // argument, so a multi-byte replacement would be cut down to its
        // first byte. The cast guards against charsUntil() returning false.
        $doctypeName = strtolower(str_replace("\0", UTF8Utils::FFFD, (string) $doctypeName));

        $tok = $this->scanner->current();

        // If false, emit a parse error, DOCTYPE, and return.
        if ($tok === false) {
            $this->parseError('Unexpected EOF in DOCTYPE declaration.');
            $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, null, true);

            return true;
        }

        // Short DOCTYPE, like <!DOCTYPE html>
        if ($tok == '>') {
            // DOCTYPE without a name.
            if (strlen($doctypeName) == 0) {
                $this->parseError("Expected a DOCTYPE name. Got nothing.");
                $this->events->doctype($doctypeName, 0, null, true);
                $this->scanner->next();

                return true;
            }
            $this->events->doctype($doctypeName);
            $this->scanner->next();

            return true;
        }
        $this->scanner->whitespace();

        $pub = strtoupper($this->scanner->getAsciiAlpha());
        $white = strlen($this->scanner->whitespace());

        // Get ID, and flag it as pub or system.
        if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) {
            // Get the sys ID.
            $type = $pub == 'PUBLIC' ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM;
            $id = $this->quotedString("\0>");
            if ($id === false) {
                // No quoted identifier follows the keyword.
                $this->events->doctype($doctypeName, $type, $pub, false);

                return false;
            }

            // Premature EOF.
            if ($this->scanner->current() === false) {
                $this->parseError("Unexpected EOF in DOCTYPE");
                $this->events->doctype($doctypeName, $type, $id, true);

                return true;
            }

            // Well-formed complete DOCTYPE.
            $this->scanner->whitespace();
            if ($this->scanner->current() == '>') {
                $this->events->doctype($doctypeName, $type, $id, false);
                $this->scanner->next();

                return true;
            }

            // If we get here, we have <!DOCTYPE foo PUBLIC "bar" SOME_JUNK
            // Throw away the junk, parse error, quirks mode, return true.
            $this->scanner->charsUntil(">");
            $this->parseError("Malformed DOCTYPE.");
            $this->events->doctype($doctypeName, $type, $id, true);
            $this->scanner->next();

            return true;
        }

        // Else it's a bogus DOCTYPE.
        // Consume to > and trash.
        $this->scanner->charsUntil('>');

        $this->parseError("Expected PUBLIC or SYSTEM. Got %s.", $pub);
        $this->events->doctype($doctypeName, 0, null, true);
        $this->scanner->next();

        return true;
    }
824
825
    /**
     * Utility for reading a quoted string.
     *
     * The opening quote character (either " or ') also serves as the closing
     * quote. A missing closing quote is reported as a parse error, and the
     * text read so far is still returned.
     *
     * @param string $stopchars
     *            Characters (in addition to a close-quote) that should stop the string.
     *            E.g. sometimes '>' is higher precedence than '"' or "'".
     *
     * @return mixed String if one is found (quotations omitted); false if
     *               the scanner is not positioned on a quote character.
     */
    protected function quotedString($stopchars)
    {
        $quote = $this->scanner->current();
        if ('"' != $quote && "'" != $quote) {
            return false;
        }

        $this->scanner->next();
        $contents = $this->scanner->charsUntil($quote . $stopchars);

        if ($this->scanner->current() == $quote) {
            $this->scanner->next();
        } else {
            // Parse error because no close quote.
            $this->parseError("Expected %s, got %s", $quote, $this->scanner->current());
        }

        return $contents;
    }
850
851
    /**
     * Handle a CDATA section.
     *
     * Expects the scanner to sit on the '[' of "<![CDATA[". Everything up to
     * the terminating "]]>" is emitted as one cdata event. Malformed openers
     * and sections cut off by EOF are reported as parse errors and emitted
     * as bogus comments.
     *
     * @return bool False if the current character is not '['; otherwise true
     *              (or the result of bogusComment() for a malformed opener).
     */
    protected function cdataSection()
    {
        if ($this->scanner->current() != '[') {
            return false;
        }
        $cdata = '';
        $this->scanner->next();

        $chars = $this->scanner->charsWhile('CDAT');
        if ($chars != 'CDATA' || $this->scanner->current() != '[') {
            $this->parseError('Expected [CDATA[, got %s', $chars);

            return $this->bogusComment('<![' . $chars);
        }

        $tok = $this->scanner->next();

        // Test for the terminator BEFORE consuming a character. Consuming
        // first would swallow the leading ']' of an empty section
        // (<![CDATA[]]>) and run past its terminator.
        while (! $this->scanner->sequenceMatches(']]>')) {
            if ($tok === false) {
                $this->parseError('Unexpected EOF inside CDATA.');
                $this->bogusComment('<![CDATA[' . $cdata);

                return true;
            }
            $cdata .= $tok;
            $tok = $this->scanner->next();
        }

        // Consume ]]>
        $this->scanner->consume(3);

        $this->events->cdata($cdata);

        return true;
    }
887
888
    // ================================================================
889
    // Non-HTML5
890
    // ================================================================
891
    /**
     * Handle a processing instruction.
     *
     * XML processing instructions are supposed to be ignored in HTML5,
     * treated as "bogus comments". However, since we're not a user
     * agent, we allow them. We consume until ?> and then issue a
     * EventListener::processingInstruction() event.
     *
     * Anything that does not look like "<?name data" (no name, no whitespace
     * after the name, or EOF right after it) is reported as a parse error
     * and emitted as a bogus comment. $procName therefore only reaches the
     * event handler as a non-empty string.
     *
     * @return bool False if the current character is not '?', true otherwise.
     */
    protected function processingInstruction()
    {
        if ($this->scanner->current() != '?') {
            return false;
        }

        $tok = $this->scanner->next();
        $procName = $this->scanner->getAsciiAlpha();
        $white = strlen($this->scanner->whitespace());

        // If not a PI, send to bogusComment.
        // The EOF test must be strict: with a loose comparison the character
        // "0" equals false, so a PI whose data starts with '0' would be
        // mistaken for EOF.
        if (strlen($procName) == 0 || $white == 0 || $this->scanner->current() === false) {
            $this->parseError("Expected processing instruction name, got $tok");
            $this->bogusComment('<?' . $tok . $procName);

            return true;
        }

        $data = '';
        // As long as it's not the case that the next two chars are ? and >.
        while (! ($this->scanner->current() == '?' && $this->scanner->peek() == '>')) {
            $data .= $this->scanner->current();

            $tok = $this->scanner->next();
            if ($tok === false) {
                $this->parseError("Unexpected EOF in processing instruction.");
                $this->events->processingInstruction($procName, $data);

                return true;
            }
        }

        $this->scanner->next(); // >
        $this->scanner->next(); // Next token.
        $this->events->processingInstruction($procName, $data);

        return true;
    }
936
937
    // ================================================================
938
    // UTILITY FUNCTIONS
939
    // ================================================================
940
941
    /**
942
     * Read from the input stream until we get to the desired sequence
943
     * or hit the end of the input stream.
944
     *
945
     * @param string $sequence
946
     *
947
     * @return string
948
     */
949 8
    protected function readUntilSequence($sequence)
    {
        // Skipping ahead to the first character of the sequence lets the
        // scanner hand back large runs of text in one call.
        $lead = substr($sequence, 0, 1);
        $collected = '';

        while (false !== $this->scanner->current()) {
            $collected .= $this->scanner->charsUntil($lead);

            // The sequence starts right here: stop without consuming it.
            if ($this->scanner->sequenceMatches($sequence, false)) {
                return $collected;
            }

            // False alarm (or nothing left to match): keep the current
            // character as text and move on.
            $collected .= $this->scanner->current();
            $this->scanner->next();
        }

        // The loop only falls through when the input is exhausted.
        $this->parseError("Unexpected EOF during text read.");

        return $collected;
    }
970
971
    /**
972
     * Check if upcoming chars match the given sequence.
973
     *
974
     * This will read the stream for the $sequence. If it's
975
     * found, this will return true. If not, return false.
976
     * Since this unconsumes any chars it reads, the caller
977
     * will still need to read the next sequence, even if
978
     * this returns true.
979
     *
980
     * Example: $this->scanner->sequenceMatches('</script>') will
981
     * see if the input stream is at the start of a
982
     * '</script>' string.
983
     *
984
     * @param string $sequence
985
     * @param bool $caseSensitive
986
     *
987
     * @return bool
988
     */
989
    protected function sequenceMatches($sequence, $caseSensitive = true)
    {
        // Raise a deprecation notice; the @ suppresses it here so it does
        // not surface as output under default error reporting.
        $notice = __METHOD__ . ' method is deprecated since version 2.4 and will be removed in 3.0. Use Scanner::sequenceMatches() instead.';
        @trigger_error($notice, E_USER_DEPRECATED);

        // The scanner owns the real implementation; simply forward the call.
        $matched = $this->scanner->sequenceMatches($sequence, $caseSensitive);

        return $matched;
    }
995
996
    /**
997
     * Send a TEXT event with the contents of the text buffer.
998
     *
999
     * This emits an EventHandler::text() event with the current contents of the
1000
     * temporary text buffer. (The buffer is used to group as much PCDATA
1001
     * as we can instead of emitting lots and lots of TEXT events.)
1002
     */
1003 127
    protected function flushBuffer()
    {
        // Only emit a text event when something has actually been buffered.
        if ('' !== $this->text) {
            $this->events->text($this->text);

            // Start over with an empty buffer once the text has been sent.
            $this->text = '';
        }
    }
1011
1012
    /**
1013
     * Add text to the temporary buffer.
1014
     *
1015
     * @see flushBuffer()
1016
     *
1017
     * @param string $str
1018
     */
1019 9
    protected function buffer($str)
    {
        // Append to whatever text is already waiting to be flushed.
        $this->text = $this->text . $str;
    }
1023
1024
    /**
1025
     * Emit a parse error.
1026
     *
1027
     * A parse error always returns false because it never consumes any
1028
     * characters.
1029
     *
1030
     * @param string $msg
1031
     *
1032
     * @return bool Always false.
1033
     */
1034 15
    protected function parseError($msg)
    {
        // Anything passed after the message is treated as printf-style
        // substitution values for it.
        $received = func_get_args();
        if (count($received) > 1) {
            $msg = vsprintf($msg, array_slice($received, 1));
        }

        // Report the error together with the scanner's current location.
        $this->events->parseError(
            $msg,
            $this->scanner->currentLine(),
            $this->scanner->columnOffset()
        );

        return false;
    }
1049
1050
    /**
1051
     * Decode a character reference and return the string.
1052
     *
1053
     * If $inAttribute is set to true, a bare & will be returned as-is.
1054
     *
1055
     * @param bool $inAttribute
1056
     *            Set to true if the text is inside of an attribute value.
1057
     *            false otherwise.
1058
     *
1059
     * @return string
1060
     */
1061 12
    protected function decodeCharacterReference($inAttribute = false)
    {
        // Next char after &.
        $tok = $this->scanner->next();
        // Remember where the candidate reference begins so that everything
        // read after this point can be handed back if it is not an entity.
        $start = $this->scanner->position();

        if ($tok == false) {
            return '&';
        }

        // These indicate not an entity. We return just
        // the &.
        if ($tok === "\t" || $tok === "\n" || $tok === "\f" || $tok === ' ' || $tok === '&' || $tok === '<') {
            return '&';
        }

        // Numeric entity
        if ($tok == '#') {
            $tok = $this->scanner->next();

            // Hexadecimal encoding.
            // X[0-9a-fA-F]+;
            // x[0-9a-fA-F]+;
            if ($tok == 'x' || $tok == 'X') {
                $tok = $this->scanner->next(); // Consume x

                // Convert from hex code to char.
                $hex = $this->scanner->getHex();

                // Only a missing digit string is an error. empty() must not
                // be used here: it is also true for the valid digit string
                // '0', which made "&#x0;" take this error path while
                // "&#x00;" was decoded normally.
                if (!is_string($hex) || $hex === '') {
                    $this->parseError("Expected &#xHEX;, got &#x%s", $tok);
                    // We unconsume because we don't know what parser rules might
                    // be in effect for the remaining chars. For example. '&#>'
                    // might result in a specific parsing rule inside of tag
                    // contexts, while not inside of pcdata context.
                    $this->scanner->unconsume(2);

                    return '&';
                }
                $entity = CharacterReference::lookupHex($hex);
            } else {
                // Decimal encoding.
                // [0-9]+;

                // Convert from decimal to char.
                $numeric = $this->scanner->getNumeric();
                if ($numeric === false) {
                    $this->parseError("Expected &#DIGITS;, got &#%s", $tok);
                    $this->scanner->unconsume(2);

                    return '&';
                }
                $entity = CharacterReference::lookupDecimal($numeric);
            }
        } elseif ($tok === '=' && $inAttribute) {
            // "&=" inside an attribute value is a bare ampersand.
            return '&';
        } else { // String entity.

            // Attempt to consume a string up to a ';'.
            // [a-zA-Z0-9]+;
            $cname = $this->scanner->getAsciiAlphaNum();
            $entity = CharacterReference::lookupName($cname);

            // When no entity is found provide the name of the unmatched string
            // and continue on as the & is not part of an entity. The & will
            // be converted to &amp; elsewhere.
            if ($entity == null) {
                // Inside an attribute, an unknown non-empty name is not
                // reported as a parse error.
                if (!$inAttribute || strlen($cname) === 0) {
                    $this->parseError("No match in entity table for '%s'", $cname);
                }
                $this->scanner->unconsume($this->scanner->position() - $start);

                return '&';
            }
        }

        // The scanner has advanced the cursor for us.
        $tok = $this->scanner->current();

        // We have an entity. We're done here.
        if ($tok == ';') {
            $this->scanner->next();

            return $entity;
        }

        // If in an attribute, then failing to match ; means unconsume the
        // entire string. Otherwise, failure to match is an error.
        if ($inAttribute) {
            $this->scanner->unconsume($this->scanner->position() - $start);

            return '&';
        }

        $this->parseError("Expected &ENTITY;, got &ENTITY%s (no trailing ;) ", $tok);

        return '&' . $entity;
    }
1152
}
1153