Completed
Push — master ( ed92f4...60669a )
by Daniel
03:02
created

thirdparty/html5lib/HTML5/Tokenizer.php (23 issues)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
3
/*
4
5
Copyright 2007 Jeroen van der Meer <http://jero.net/>
6
Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
7
Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
8
9
Permission is hereby granted, free of charge, to any person obtaining a
10
copy of this software and associated documentation files (the
11
"Software"), to deal in the Software without restriction, including
12
without limitation the rights to use, copy, modify, merge, publish,
13
distribute, sublicense, and/or sell copies of the Software, and to
14
permit persons to whom the Software is furnished to do so, subject to
15
the following conditions:
16
17
The above copyright notice and this permission notice shall be included
18
in all copies or substantial portions of the Software.
19
20
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
28
*/
29
30
// Some conventions:
31
// /* */ indicates verbatim text from the HTML 5 specification
32
// // indicates regular comments
33
34
// all flags are in hyphenated form
35
36
class HTML5_Tokenizer {
37
    /**
38
     * Points to an InputStream object.
39
     */
40
    protected $stream;
41
42
    /**
43
     * Tree builder that the tokenizer emits tokens to.
44
     */
45
    private $tree;
46
47
    /**
48
     * Current content model we are parsing as.
49
     */
50
    protected $content_model;
51
52
    /**
53
     * Current token that is being built, but not yet emitted. Also
54
     * is the last token emitted, if applicable.
55
     */
56
    protected $token;
57
58
    // These are constants describing the content model
59
    const PCDATA    = 0;
60
    const RCDATA    = 1;
61
    const CDATA     = 2;
62
    const PLAINTEXT = 3;
63
64
    // These are constants describing tokens
65
    // XXX should probably be moved somewhere else, probably the
66
    // HTML5 class.
67
    const DOCTYPE        = 0;
68
    const STARTTAG       = 1;
69
    const ENDTAG         = 2;
70
    const COMMENT        = 3;
71
    const CHARACTER      = 4;
72
    const SPACECHARACTER = 5;
73
    const EOF            = 6;
74
    const PARSEERROR     = 7;
75
76
    // These are constants representing bunches of characters.
77
    const ALPHA       = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
78
    const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
79
    const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
80
    const DIGIT       = '0123456789';
81
    const HEX         = '0123456789ABCDEFabcdef';
82
    const WHITESPACE  = "\t\n\x0c ";
83
84
    /**
     * Prepares the tokenizer for a piece of markup.
     *
     * The input is wrapped in an HTML5_InputStream, the token sink is
     * chosen, and the content model starts out as PCDATA.
     *
     * @param $data    Data to parse
     * @param $builder Optional tree builder that receives the tokens; any
     *                 falsy value (including the default null) causes a
     *                 fresh HTML5_TreeBuilder to be created instead.
     */
    public function __construct($data, $builder = null) {
        $this->stream = new HTML5_InputStream($data);

        // Use the supplied builder when there is one, otherwise make our own.
        $this->tree = $builder ? $builder : new HTML5_TreeBuilder;

        $this->content_model = self::PCDATA;
    }
93
94
    /**
     * Parses the input as a fragment.
     *
     * The tree builder is first given the context via setupContext(). If
     * it then exposes a truthy content_model, the tokenizer adopts that
     * model and clears the builder's copy; a falsy value (null, or
     * self::PCDATA which is 0) leaves the tokenizer's current model as is.
     * Finally the regular parse() loop is run.
     *
     * @param $context Context handed unchanged to the tree builder's
     *                 setupContext(); defaults to null.
     */
    public function parseFragment($context = null) {
        $builder = $this->tree;
        $builder->setupContext($context);

        if ($builder->content_model) {
            // Take over the model requested by the builder, then reset it there.
            $this->content_model = $builder->content_model;
            $builder->content_model = null;
        }

        $this->parse();
    }
102
103
    // XXX maybe convert this into an iterator? regardless, this function
104
    // and the save function should go into a Parser facade of some sort
105
    /**
106
     * Performs the actual parsing of the document.
107
     */
108
    public function parse() {
109
        // Current state
110
        $state = 'data';
111
        // This is used to avoid having to have look-behind in the data state.
112
        $lastFourChars = '';
113
        /**
114
         * Escape flag as specified by the HTML5 specification: "used to
115
         * control the behavior of the tokeniser. It is either true or
116
         * false, and initially must be set to the false state."
117
         */
118
        $escape = false;
119
        //echo "\n\n";
120
        while($state !== null) {
121
            
122
            /*echo $state . ' ';
123
            switch ($this->content_model) {
124
                case self::PCDATA: echo 'PCDATA'; break;
125
                case self::RCDATA: echo 'RCDATA'; break;
126
                case self::CDATA: echo 'CDATA'; break;
127
                case self::PLAINTEXT: echo 'PLAINTEXT'; break;
128
            }
129
            if ($escape) echo " escape";
130
            echo "\n";*/
131
            
132
            switch($state) {
133
                case 'data':
134
135
                    /* Consume the next input character */
136
                    $char = $this->stream->char();
137
                    $lastFourChars .= $char;
138
                    if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
139
140
                    // see below for meaning
141
                    $hyp_cond = 
142
                        !$escape &&
143
                        (
144
                            $this->content_model === self::RCDATA ||
145
                            $this->content_model === self::CDATA
146
                        );
147
                    $amp_cond =
148
                        !$escape &&
149
                        (
150
                            $this->content_model === self::PCDATA ||
151
                            $this->content_model === self::RCDATA
152
                        );
153
                    $lt_cond =
154
                        $this->content_model === self::PCDATA ||
155
                        (
156
                            (
157
                                $this->content_model === self::RCDATA ||
158
                                $this->content_model === self::CDATA
159
                             ) &&
160
                             !$escape
161
                        );
162
                    $gt_cond = 
163
                        $escape &&
164
                        (
165
                            $this->content_model === self::RCDATA ||
166
                            $this->content_model === self::CDATA
167
                        );
168
169
                    if($char === '&' && $amp_cond) {
170
                        /* U+0026 AMPERSAND (&)
171
                        When the content model flag is set to one of the PCDATA or RCDATA
172
                        states and the escape flag is false: switch to the
173
                        character reference data state. Otherwise: treat it as per
174
                        the "anything else" entry below. */
175
                        $state = 'character reference data';
176
177 View Code Duplication
                    } elseif(
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
178
                        $char === '-' &&
179
                        $hyp_cond &&
180
                        $lastFourChars === '<!--'
181
                    ) {
182
                        /*
183
                        U+002D HYPHEN-MINUS (-)
184
                        If the content model flag is set to either the RCDATA state or
185
                        the CDATA state, and the escape flag is false, and there are at
186
                        least three characters before this one in the input stream, and the
187
                        last four characters in the input stream, including this one, are
188
                        U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
189
                        and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
190
                        $escape = true;
191
192
                        /* In any case, emit the input character as a character token. Stay
193
                        in the data state. */
194
                        $this->emitToken(array(
195
                            'type' => self::CHARACTER,
196
                            'data' => '-'
197
                        ));
198
                        // We do the "any case" part as part of "anything else".
199
200
                    /* U+003C LESS-THAN SIGN (<) */
201
                    } elseif($char === '<' && $lt_cond) {
202
                        /* When the content model flag is set to the PCDATA state: switch
203
                        to the tag open state.
204
205
                        When the content model flag is set to either the RCDATA state or
206
                        the CDATA state and the escape flag is false: switch to the tag
207
                        open state.
208
209
                        Otherwise: treat it as per the "anything else" entry below. */
210
                        $state = 'tag open';
211
212
                    /* U+003E GREATER-THAN SIGN (>) */
213 View Code Duplication
                    } elseif(
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
214
                        $char === '>' &&
215
                        $gt_cond &&
216
                        substr($lastFourChars, 1) === '-->'
217
                    ) {
218
                        /* If the content model flag is set to either the RCDATA state or
219
                        the CDATA state, and the escape flag is true, and the last three
220
                        characters in the input stream including this one are U+002D
221
                        HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
222
                        set the escape flag to false. */
223
                        $escape = false;
224
225
                        /* In any case, emit the input character as a character token.
226
                        Stay in the data state. */
227
                        $this->emitToken(array(
228
                            'type' => self::CHARACTER,
229
                            'data' => '>'
230
                        ));
231
                        // We do the "any case" part as part of "anything else".
232
233
                    } elseif($char === false) {
234
                        /* EOF
235
                        Emit an end-of-file token. */
236
                        $state = null;
237
                        $this->tree->emitToken(array(
238
                            'type' => self::EOF
239
                        ));
240
                    
241
                    } elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
242
                        // Directly after emitting a token you switch back to the "data
243
                        // state". At that point spaceCharacters are important so they are
244
                        // emitted separately.
245
                        $chars = $this->stream->charsWhile(self::WHITESPACE);
246
                        $this->emitToken(array(
247
                            'type' => self::SPACECHARACTER,
248
                            'data' => $char . $chars
249
                        ));
250
                        $lastFourChars .= $chars;
251
                        if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
252
253
                    } else {
254
                        /* Anything else
255
                        THIS IS AN OPTIMIZATION: Get as many characters that
256
                        otherwise would also be treated as a character token and emit it
257
                        as a single character token. Stay in the data state. */
258
                        
259
                        $mask = '';
260
                        if ($hyp_cond) $mask .= '-';
261
                        if ($amp_cond) $mask .= '&';
262
                        if ($lt_cond)  $mask .= '<';
263
                        if ($gt_cond)  $mask .= '>';
264
265
                        if ($mask === '') {
266
                            $chars = $this->stream->remainingChars();
267
                        } else {
268
                            $chars = $this->stream->charsUntil($mask);
269
                        }
270
271
                        $this->emitToken(array(
272
                            'type' => self::CHARACTER,
273
                            'data' => $char . $chars
274
                        ));
275
276
                        $lastFourChars .= $chars;
277
                        if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4);
278
279
                        $state = 'data';
280
                    }
281
                break;
282
283
                case 'character reference data':
284
                    /* (This cannot happen if the content model flag
285
                    is set to the CDATA state.) */
286
287
                    /* Attempt to consume a character reference, with no
288
                    additional allowed character. */
289
                    $entity = $this->consumeCharacterReference();
290
291
                    /* If nothing is returned, emit a U+0026 AMPERSAND
292
                    character token. Otherwise, emit the character token that
293
                    was returned. */
294
                    // This is all done when consuming the character reference.
295
                    $this->emitToken(array(
296
                        'type' => self::CHARACTER,
297
                        'data' => $entity
298
                    ));
299
300
                    /* Finally, switch to the data state. */
301
                    $state = 'data';
302
                break;
303
304
                case 'tag open':
305
                    $char = $this->stream->char();
306
307
                    switch($this->content_model) {
308
                        case self::RCDATA:
309
                        case self::CDATA:
310
                            /* Consume the next input character. If it is a
311
                            U+002F SOLIDUS (/) character, switch to the close
312
                            tag open state. Otherwise, emit a U+003C LESS-THAN
313
                            SIGN character token and reconsume the current input
314
                            character in the data state. */
315
                            // We consumed above.
316
317
                            if($char === '/') {
318
                                $state = 'close tag open';
319
320
                            } else {
321
                                $this->emitToken(array(
322
                                    'type' => self::CHARACTER,
323
                                    'data' => '<'
324
                                ));
325
326
                                $this->stream->unget();
327
328
                                $state = 'data';
329
                            }
330
                        break;
331
332
                        case self::PCDATA:
333
                            /* If the content model flag is set to the PCDATA state
334
                            Consume the next input character: */
335
                            // We consumed above.
336
337
                            if($char === '!') {
338
                                /* U+0021 EXCLAMATION MARK (!)
339
                                Switch to the markup declaration open state. */
340
                                $state = 'markup declaration open';
341
342
                            } elseif($char === '/') {
343
                                /* U+002F SOLIDUS (/)
344
                                Switch to the close tag open state. */
345
                                $state = 'close tag open';
346
347
                            } elseif('A' <= $char && $char <= 'Z') {
348
                                /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
349
                                Create a new start tag token, set its tag name to the lowercase
350
                                version of the input character (add 0x0020 to the character's code
351
                                point), then switch to the tag name state. (Don't emit the token
352
                                yet; further details will be filled in before it is emitted.) */
353
                                $this->token = array(
354
                                    'name'  => strtolower($char),
355
                                    'type'  => self::STARTTAG,
356
                                    'attr'  => array()
357
                                );
358
359
                                $state = 'tag name';
360
361
                            } elseif('a' <= $char && $char <= 'z') {
362
                                /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
363
                                Create a new start tag token, set its tag name to the input
364
                                character, then switch to the tag name state. (Don't emit
365
                                the token yet; further details will be filled in before it
366
                                is emitted.) */
367
                                $this->token = array(
368
                                    'name'  => $char,
369
                                    'type'  => self::STARTTAG,
370
                                    'attr'  => array()
371
                                );
372
373
                                $state = 'tag name';
374
375
                            } elseif($char === '>') {
376
                                /* U+003E GREATER-THAN SIGN (>)
377
                                Parse error. Emit a U+003C LESS-THAN SIGN character token and a
378
                                U+003E GREATER-THAN SIGN character token. Switch to the data state. */
379
                                $this->emitToken(array(
380
                                    'type' => self::PARSEERROR,
381
                                    'data' => 'expected-tag-name-but-got-right-bracket'
382
                                ));
383
                                $this->emitToken(array(
384
                                    'type' => self::CHARACTER,
385
                                    'data' => '<>'
386
                                ));
387
388
                                $state = 'data';
389
390
                            } elseif($char === '?') {
391
                                /* U+003F QUESTION MARK (?)
392
                                Parse error. Switch to the bogus comment state. */
393
                                $this->emitToken(array(
394
                                    'type' => self::PARSEERROR,
395
                                    'data' => 'expected-tag-name-but-got-question-mark'
396
                                ));
397
                                $this->token = array(
398
                                    'data' => '?',
399
                                    'type' => self::COMMENT
400
                                );
401
                                $state = 'bogus comment';
402
403 View Code Duplication
                            } else {
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
404
                                /* Anything else
405
                                Parse error. Emit a U+003C LESS-THAN SIGN character token and
406
                                reconsume the current input character in the data state. */
407
                                $this->emitToken(array(
408
                                    'type' => self::PARSEERROR,
409
                                    'data' => 'expected-tag-name'
410
                                ));
411
                                $this->emitToken(array(
412
                                    'type' => self::CHARACTER,
413
                                    'data' => '<'
414
                                ));
415
416
                                $state = 'data';
417
                                $this->stream->unget();
418
                            }
419
                        break;
420
                    }
421
                break;
422
423
                case 'close tag open':
424
                    if (
425
                        $this->content_model === self::RCDATA ||
426
                        $this->content_model === self::CDATA
427
                    ) {
428
                        /* If the content model flag is set to the RCDATA or CDATA
429
                        states... */
430
                        $name = strtolower($this->stream->charsWhile(self::ALPHA));
431
                        $following = $this->stream->char();
432
                        $this->stream->unget();
433
                        if (
434
                            !$this->token ||
435
                            $this->token['name'] !== $name ||
436
                            $this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false))
437
                        ) {
438
                            /* if no start tag token has ever been emitted by this instance
439
                            of the tokenizer (fragment case), or, if the next few
440
                            characters do not match the tag name of the last start tag
441
                            token emitted (compared in an ASCII case-insensitive manner),
442
                            or if they do but they are not immediately followed by one of
443
                            the following characters:
444
445
                                * U+0009 CHARACTER TABULATION
446
                                * U+000A LINE FEED (LF)
447
                                * U+000C FORM FEED (FF)
448
                                * U+0020 SPACE
449
                                * U+003E GREATER-THAN SIGN (>)
450
                                * U+002F SOLIDUS (/)
451
                                * EOF
452
453
                            ...then emit a U+003C LESS-THAN SIGN character token, a
454
                            U+002F SOLIDUS character token, and switch to the data
455
                            state to process the next input character. */
456
                            // XXX: Probably ought to replace in_array with $following === x ||...
457
458
                            // We also need to emit $name now we've consumed that, as we
459
                            // know it'll just be emitted as a character token.
460
                            $this->emitToken(array(
461
                                'type' => self::CHARACTER,
462
                                'data' => '</' . $name
463
                            ));
464
465
                            $state = 'data';
466
                        } else {
467
                            // This matches what would happen if we actually did the
468
                            // otherwise below (but we can't because we've consumed too
469
                            // much).
470
471
                            // Start the end tag token with the name we already have.
472
                            $this->token = array(
473
                                'name'  => $name,
474
                                'type'  => self::ENDTAG
475
                            );
476
477
                            // Change to tag name state.
478
                            $state = 'tag name';
479
                        }
480
                    } elseif ($this->content_model === self::PCDATA) {
481
                        /* Otherwise, if the content model flag is set to the PCDATA
482
                        state [...]: */
483
                        $char = $this->stream->char();
484
485
                        if ('A' <= $char && $char <= 'Z') {
486
                            /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
487
                            Create a new end tag token, set its tag name to the lowercase version
488
                            of the input character (add 0x0020 to the character's code point), then
489
                            switch to the tag name state. (Don't emit the token yet; further details
490
                            will be filled in before it is emitted.) */
491
                            $this->token = array(
492
                                'name'  => strtolower($char),
493
                                'type'  => self::ENDTAG
494
                            );
495
496
                            $state = 'tag name';
497
498
                        } elseif ('a' <= $char && $char <= 'z') {
499
                            /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
500
                            Create a new end tag token, set its tag name to the
501
                            input character, then switch to the tag name state.
502
                            (Don't emit the token yet; further details will be
503
                            filled in before it is emitted.) */
504
                            $this->token = array(
505
                                'name'  => $char,
506
                                'type'  => self::ENDTAG
507
                            );
508
509
                            $state = 'tag name';
510
511
                        } elseif($char === '>') {
512
                            /* U+003E GREATER-THAN SIGN (>)
513
                            Parse error. Switch to the data state. */
514
                            $this->emitToken(array(
515
                                'type' => self::PARSEERROR,
516
                                'data' => 'expected-closing-tag-but-got-right-bracket'
517
                            ));
518
                            $state = 'data';
519
520 View Code Duplication
                        } elseif($char === false) {
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
521
                            /* EOF
522
                            Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
523
                            SOLIDUS character token. Reconsume the EOF character in the data state. */
524
                            $this->emitToken(array(
525
                                'type' => self::PARSEERROR,
526
                                'data' => 'expected-closing-tag-but-got-eof'
527
                            ));
528
                            $this->emitToken(array(
529
                                'type' => self::CHARACTER,
530
                                'data' => '</'
531
                            ));
532
533
                            $this->stream->unget();
534
                            $state = 'data';
535
536
                        } else {
537
                            /* Parse error. Switch to the bogus comment state. */
538
                            $this->emitToken(array(
539
                                'type' => self::PARSEERROR,
540
                                'data' => 'expected-closing-tag-but-got-char'
541
                            ));
542
                            $this->token = array(
543
                                'data' => $char,
544
                                'type' => self::COMMENT
545
                            );
546
                            $state = 'bogus comment';
547
                        }
548
                    }
549
                break;
550
551
                case 'tag name':
552
                    /* Consume the next input character: */
553
                    $char = $this->stream->char();
554
555
                    if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
556
                        /* U+0009 CHARACTER TABULATION
557
                        U+000A LINE FEED (LF)
558
                        U+000C FORM FEED (FF)
559
                        U+0020 SPACE
560
                        Switch to the before attribute name state. */
561
                        $state = 'before attribute name';
562
563
                    } elseif($char === '/') {
564
                        /* U+002F SOLIDUS (/)
565
                        Switch to the self-closing start tag state. */
566
                        $state = 'self-closing start tag';
567
568
                    } elseif($char === '>') {
569
                        /* U+003E GREATER-THAN SIGN (>)
570
                        Emit the current tag token. Switch to the data state. */
571
                        $this->emitToken($this->token);
572
                        $state = 'data';
573
574
                    } elseif('A' <= $char && $char <= 'Z') {
575
                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
576
                        Append the lowercase version of the current input
577
                        character (add 0x0020 to the character's code point) to
578
                        the current tag token's tag name. Stay in the tag name state. */
579
                        $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
580
581
                        $this->token['name'] .= strtolower($char . $chars);
582
                        $state = 'tag name';
583
584
                    } elseif($char === false) {
585
                        /* EOF
586
                        Parse error. Reconsume the EOF character in the data state. */
587
                        $this->emitToken(array(
588
                            'type' => self::PARSEERROR,
589
                            'data' => 'eof-in-tag-name'
590
                        ));
591
592
                        $this->stream->unget();
593
                        $state = 'data';
594
595
                    } else {
596
                        /* Anything else
597
                        Append the current input character to the current tag token's tag name.
598
                        Stay in the tag name state. */
599
                        $chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA);
600
601
                        $this->token['name'] .= $char . $chars;
602
                        $state = 'tag name';
603
                    }
604
                break;
605
606 View Code Duplication
                case 'before attribute name':
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
607
                    /* Consume the next input character: */
608
                    $char = $this->stream->char();
609
610
                    // this conditional is optimized, check bottom
611
                    if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
612
                        /* U+0009 CHARACTER TABULATION
613
                        U+000A LINE FEED (LF)
614
                        U+000C FORM FEED (FF)
615
                        U+0020 SPACE
616
                        Stay in the before attribute name state. */
617
                        $state = 'before attribute name';
618
619
                    } elseif($char === '/') {
620
                        /* U+002F SOLIDUS (/)
621
                        Switch to the self-closing start tag state. */
622
                        $state = 'self-closing start tag';
623
624
                    } elseif($char === '>') {
625
                        /* U+003E GREATER-THAN SIGN (>)
626
                        Emit the current tag token. Switch to the data state. */
627
                        $this->emitToken($this->token);
628
                        $state = 'data';
629
630
                    } elseif('A' <= $char && $char <= 'Z') {
631
                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
632
                        Start a new attribute in the current tag token. Set that
633
                        attribute's name to the lowercase version of the current
634
                        input character (add 0x0020 to the character's code
635
                        point), and its value to the empty string. Switch to the
636
                        attribute name state. */
637
                        $this->token['attr'][] = array(
638
                            'name'  => strtolower($char),
639
                            'value' => ''
640
                        );
641
642
                        $state = 'attribute name';
643
644
                    } elseif($char === false) {
645
                        /* EOF
646
                        Parse error. Reconsume the EOF character in the data state. */
647
                        $this->emitToken(array(
648
                            'type' => self::PARSEERROR,
649
                            'data' => 'expected-attribute-name-but-got-eof'
650
                        ));
651
652
                        $this->stream->unget();
653
                        $state = 'data';
654
655
                    } else {
656
                        /* U+0022 QUOTATION MARK (")
657
                           U+0027 APOSTROPHE (')
658
                           U+003C LESS-THAN SIGN (<)
659
                           U+003D EQUALS SIGN (=)
660
                        Parse error. Treat it as per the "anything else" entry
661
                        below. */
662
                        if($char === '"' || $char === "'" || $char === '<' || $char === '=') {
663
                            $this->emitToken(array(
664
                                'type' => self::PARSEERROR,
665
                                'data' => 'invalid-character-in-attribute-name'
666
                            ));
667
                        }
668
669
                        /* Anything else
670
                        Start a new attribute in the current tag token. Set that attribute's
671
                        name to the current input character, and its value to the empty string.
672
                        Switch to the attribute name state. */
673
                        $this->token['attr'][] = array(
674
                            'name'  => $char,
675
                            'value' => ''
676
                        );
677
678
                        $state = 'attribute name';
679
                    }
680
                break;
681
682
                case 'attribute name':
683
                    // Consume the next input character:
684
                    $char = $this->stream->char();
685
686
                    // this conditional is optimized, check bottom
687
                    if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
688
                        /* U+0009 CHARACTER TABULATION
689
                        U+000A LINE FEED (LF)
690
                        U+000C FORM FEED (FF)
691
                        U+0020 SPACE
692
                        Switch to the after attribute name state. */
693
                        $state = 'after attribute name';
694
695
                    } elseif($char === '/') {
696
                        /* U+002F SOLIDUS (/)
697
                        Switch to the self-closing start tag state. */
698
                        $state = 'self-closing start tag';
699
700
                    } elseif($char === '=') {
701
                        /* U+003D EQUALS SIGN (=)
702
                        Switch to the before attribute value state. */
703
                        $state = 'before attribute value';
704
705
                    } elseif($char === '>') {
706
                        /* U+003E GREATER-THAN SIGN (>)
707
                        Emit the current tag token. Switch to the data state. */
708
                        $this->emitToken($this->token);
709
                        $state = 'data';
710
711
                    } elseif('A' <= $char && $char <= 'Z') {
712
                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
713
                        Append the lowercase version of the current input
714
                        character (add 0x0020 to the character's code point) to
715
                        the current attribute's name. Stay in the attribute name
716
                        state. */
717
                        $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
718
719
                        $last = count($this->token['attr']) - 1;
720
                        $this->token['attr'][$last]['name'] .= strtolower($char . $chars);
721
722
                        $state = 'attribute name';
723
724
                    } elseif($char === false) {
725
                        /* EOF
726
                        Parse error. Reconsume the EOF character in the data state. */
727
                        $this->emitToken(array(
728
                            'type' => self::PARSEERROR,
729
                            'data' => 'eof-in-attribute-name'
730
                        ));
731
732
                        $this->stream->unget();
733
                        $state = 'data';
734
735 View Code Duplication
                    } else {
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
736
                        /* U+0022 QUOTATION MARK (")
737
                           U+0027 APOSTROPHE (')
738
                           U+003C LESS-THAN SIGN (<)
739
                        Parse error. Treat it as per the "anything else"
740
                        entry below. */
741
                        if($char === '"' || $char === "'" || $char === '<') {
742
                            $this->emitToken(array(
743
                                'type' => self::PARSEERROR,
744
                                'data' => 'invalid-character-in-attribute-name'
745
                            ));
746
                        }
747
748
                        /* Anything else
749
                        Append the current input character to the current attribute's name.
750
                        Stay in the attribute name state. */
751
                        $chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA);
752
753
                        $last = count($this->token['attr']) - 1;
754
                        $this->token['attr'][$last]['name'] .= $char . $chars;
755
756
                        $state = 'attribute name';
757
                    }
758
759
                    /* When the user agent leaves the attribute name state
760
                    (and before emitting the tag token, if appropriate), the
761
                    complete attribute's name must be compared to the other
762
                    attributes on the same token; if there is already an
763
                    attribute on the token with the exact same name, then this
764
                    is a parse error and the new attribute must be dropped, along
765
                    with the value that gets associated with it (if any). */
766
                    // this might be implemented in the emitToken method
767
                break;
768
769 View Code Duplication
                case 'after attribute name':
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
770
                    // Consume the next input character:
771
                    $char = $this->stream->char();
772
773
                    // this is an optimized conditional, check the bottom
774
                    if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
775
                        /* U+0009 CHARACTER TABULATION
776
                        U+000A LINE FEED (LF)
777
                        U+000C FORM FEED (FF)
778
                        U+0020 SPACE
779
                        Stay in the after attribute name state. */
780
                        $state = 'after attribute name';
781
782
                    } elseif($char === '/') {
783
                        /* U+002F SOLIDUS (/)
784
                        Switch to the self-closing start tag state. */
785
                        $state = 'self-closing start tag';
786
787
                    } elseif($char === '=') {
788
                        /* U+003D EQUALS SIGN (=)
789
                        Switch to the before attribute value state. */
790
                        $state = 'before attribute value';
791
792
                    } elseif($char === '>') {
793
                        /* U+003E GREATER-THAN SIGN (>)
794
                        Emit the current tag token. Switch to the data state. */
795
                        $this->emitToken($this->token);
796
                        $state = 'data';
797
798
                    } elseif('A' <= $char && $char <= 'Z') {
799
                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
800
                        Start a new attribute in the current tag token. Set that
801
                        attribute's name to the lowercase version of the current
802
                        input character (add 0x0020 to the character's code
803
                        point), and its value to the empty string. Switch to the
804
                        attribute name state. */
805
                        $this->token['attr'][] = array(
806
                            'name'  => strtolower($char),
807
                            'value' => ''
808
                        );
809
810
                        $state = 'attribute name';
811
812
                    } elseif($char === false) {
813
                        /* EOF
814
                        Parse error. Reconsume the EOF character in the data state. */
815
                        $this->emitToken(array(
816
                            'type' => self::PARSEERROR,
817
                            'data' => 'expected-end-of-tag-but-got-eof'
818
                        ));
819
820
                        $this->stream->unget();
821
                        $state = 'data';
822
823
                    } else {
824
                        /* U+0022 QUOTATION MARK (")
825
                           U+0027 APOSTROPHE (')
826
                           U+003C LESS-THAN SIGN (<)
827
                        Parse error. Treat it as per the "anything else"
828
                        entry below. */
829
                        if($char === '"' || $char === "'" || $char === "<") {
830
                            $this->emitToken(array(
831
                                'type' => self::PARSEERROR,
832
                                'data' => 'invalid-character-after-attribute-name'
833
                            ));
834
                        }
835
836
                        /* Anything else
837
                        Start a new attribute in the current tag token. Set that attribute's
838
                        name to the current input character, and its value to the empty string.
839
                        Switch to the attribute name state. */
840
                        $this->token['attr'][] = array(
841
                            'name'  => $char,
842
                            'value' => ''
843
                        );
844
845
                        $state = 'attribute name';
846
                    }
847
                break;
848
849
                case 'before attribute value':
850
                    // Consume the next input character:
851
                    $char = $this->stream->char();
852
853
                    // this is an optimized conditional
854
                    if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
855
                        /* U+0009 CHARACTER TABULATION
856
                        U+000A LINE FEED (LF)
857
                        U+000C FORM FEED (FF)
858
                        U+0020 SPACE
859
                        Stay in the before attribute value state. */
860
                        $state = 'before attribute value';
861
862
                    } elseif($char === '"') {
863
                        /* U+0022 QUOTATION MARK (")
864
                        Switch to the attribute value (double-quoted) state. */
865
                        $state = 'attribute value (double-quoted)';
866
867
                    } elseif($char === '&') {
868
                        /* U+0026 AMPERSAND (&)
869
                        Switch to the attribute value (unquoted) state and reconsume
870
                        this input character. */
871
                        $this->stream->unget();
872
                        $state = 'attribute value (unquoted)';
873
874
                    } elseif($char === '\'') {
875
                        /* U+0027 APOSTROPHE (')
876
                        Switch to the attribute value (single-quoted) state. */
877
                        $state = 'attribute value (single-quoted)';
878
879
                    } elseif($char === '>') {
880
                        /* U+003E GREATER-THAN SIGN (>)
881
                        Parse error. Emit the current tag token. Switch to the data state. */
882
                        $this->emitToken(array(
883
                            'type' => self::PARSEERROR,
884
                            'data' => 'expected-attribute-value-but-got-right-bracket'
885
                        ));
886
                        $this->emitToken($this->token);
887
                        $state = 'data';
888
889
                    } elseif($char === false) {
890
                        /* EOF
891
                        Parse error. Reconsume the EOF character in the data state. */
892
                        $this->emitToken(array(
893
                            'type' => self::PARSEERROR,
894
                            'data' => 'expected-attribute-value-but-got-eof'
895
                        ));
896
                        $this->stream->unget();
897
                        $state = 'data';
898
899
                    } else {
900
                        /* U+003D EQUALS SIGN (=)
901
                           U+003C LESS-THAN SIGN (<)
902
                        Parse error. Treat it as per the "anything else" entry below. */
903
                        if($char === '=' || $char === '<') {
904
                            $this->emitToken(array(
905
                                'type' => self::PARSEERROR,
906
                                'data' => 'equals-in-unquoted-attribute-value'
907
                            ));
908
                        }
909
910
                        /* Anything else
911
                        Append the current input character to the current attribute's value.
912
                        Switch to the attribute value (unquoted) state. */
913
                        $last = count($this->token['attr']) - 1;
914
                        $this->token['attr'][$last]['value'] .= $char;
915
916
                        $state = 'attribute value (unquoted)';
917
                    }
918
                break;
919
920 View Code Duplication
                case 'attribute value (double-quoted)':
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
921
                    // Consume the next input character:
922
                    $char = $this->stream->char();
923
924
                    if($char === '"') {
925
                        /* U+0022 QUOTATION MARK (")
926
                        Switch to the after attribute value (quoted) state. */
927
                        $state = 'after attribute value (quoted)';
928
929
                    } elseif($char === '&') {
930
                        /* U+0026 AMPERSAND (&)
931
                        Switch to the character reference in attribute value
932
                        state, with the additional allowed character
933
                        being U+0022 QUOTATION MARK ("). */
934
                        $this->characterReferenceInAttributeValue('"');
935
936
                    } elseif($char === false) {
937
                        /* EOF
938
                        Parse error. Reconsume the EOF character in the data state. */
939
                        $this->emitToken(array(
940
                            'type' => self::PARSEERROR,
941
                            'data' => 'eof-in-attribute-value-double-quote'
942
                        ));
943
944
                        $this->stream->unget();
945
                        $state = 'data';
946
947
                    } else {
948
                        /* Anything else
949
                        Append the current input character to the current attribute's value.
950
                        Stay in the attribute value (double-quoted) state. */
951
                        $chars = $this->stream->charsUntil('"&');
952
953
                        $last = count($this->token['attr']) - 1;
954
                        $this->token['attr'][$last]['value'] .= $char . $chars;
955
956
                        $state = 'attribute value (double-quoted)';
957
                    }
958
                break;
959
960 View Code Duplication
                case 'attribute value (single-quoted)':
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
961
                    // Consume the next input character:
962
                    $char = $this->stream->char();
963
964
                    if($char === "'") {
965
                        /* U+0027 APOSTROPHE (')
966
                        Switch to the after attribute value (quoted) state. */
967
                        $state = 'after attribute value (quoted)';
968
969
                    } elseif($char === '&') {
970
                        /* U+0026 AMPERSAND (&)
971
                        Switch to the character reference in attribute value
                        state, with the additional allowed character being
                        U+0027 APOSTROPHE ('). */
972
                        $this->characterReferenceInAttributeValue("'");
973
974
                    } elseif($char === false) {
975
                        /* EOF
976
                        Parse error. Reconsume the EOF character in the data state. */
977
                        $this->emitToken(array(
978
                            'type' => self::PARSEERROR,
979
                            'data' => 'eof-in-attribute-value-single-quote'
980
                        ));
981
982
                        $this->stream->unget();
983
                        $state = 'data';
984
985
                    } else {
986
                        /* Anything else
987
                        Append the current input character to the current attribute's value.
988
                        Stay in the attribute value (single-quoted) state. */
989
                        $chars = $this->stream->charsUntil("'&");
990
991
                        $last = count($this->token['attr']) - 1;
992
                        $this->token['attr'][$last]['value'] .= $char . $chars;
993
994
                        $state = 'attribute value (single-quoted)';
995
                    }
996
                break;
997
998
                case 'attribute value (unquoted)':
999
                    // Consume the next input character:
1000
                    $char = $this->stream->char();
1001
1002
                    if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1003
                        /* U+0009 CHARACTER TABULATION
1004
                        U+000A LINE FEED (LF)
1005
                        U+000C FORM FEED (FF)
1006
                        U+0020 SPACE
1007
                        Switch to the before attribute name state. */
1008
                        $state = 'before attribute name';
1009
1010
                    } elseif($char === '&') {
1011
                        /* U+0026 AMPERSAND (&)
1012
                        Switch to the character reference in attribute value state, with the
1013
                        additional allowed character being U+003E
1014
                        GREATER-THAN SIGN (>). */
1015
                        $this->characterReferenceInAttributeValue('>');
1016
1017
                    } elseif($char === '>') {
1018
                        /* U+003E GREATER-THAN SIGN (>)
1019
                        Emit the current tag token. Switch to the data state. */
1020
                        $this->emitToken($this->token);
1021
                        $state = 'data';
1022
1023
                    } elseif ($char === false) {
1024
                        /* EOF
1025
                        Parse error. Reconsume the EOF character in the data state. */
1026
                        $this->emitToken(array(
1027
                            'type' => self::PARSEERROR,
1028
                            'data' => 'eof-in-attribute-value-no-quotes'
1029
                        ));
1030
                        $this->stream->unget();
1031
                        $state = 'data';
1032
1033 View Code Duplication
                    } else {
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
1034
                        /* U+0022 QUOTATION MARK (")
1035
                           U+0027 APOSTROPHE (')
1036
                           U+003C LESS-THAN SIGN (<)
1037
                           U+003D EQUALS SIGN (=)
1038
                        Parse error. Treat it as per the "anything else"
1039
                        entry below. */
1040
                        if($char === '"' || $char === "'" || $char === '=' || $char == '<') {
1041
                            $this->emitToken(array(
1042
                                'type' => self::PARSEERROR,
1043
                                'data' => 'unexpected-character-in-unquoted-attribute-value'
1044
                            ));
1045
                        }
1046
1047
                        /* Anything else
1048
                        Append the current input character to the current attribute's value.
1049
                        Stay in the attribute value (unquoted) state. */
1050
                        $chars = $this->stream->charsUntil("\t\n\x0c &>\"'=");
1051
1052
                        $last = count($this->token['attr']) - 1;
1053
                        $this->token['attr'][$last]['value'] .= $char . $chars;
1054
1055
                        $state = 'attribute value (unquoted)';
1056
                    }
1057
                break;
1058
1059
                case 'after attribute value (quoted)':
1060
                    /* Consume the next input character: */
1061
                    $char = $this->stream->char();
1062
1063
                    if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1064
                        /* U+0009 CHARACTER TABULATION
1065
                           U+000A LINE FEED (LF)
1066
                           U+000C FORM FEED (FF)
1067
                           U+0020 SPACE
1068
                        Switch to the before attribute name state. */
1069
                        $state = 'before attribute name';
1070
1071
                    } elseif ($char === '/') {
1072
                        /* U+002F SOLIDUS (/)
1073
                        Switch to the self-closing start tag state. */
1074
                        $state = 'self-closing start tag';
1075
1076
                    } elseif ($char === '>') {
1077
                        /* U+003E GREATER-THAN SIGN (>)
1078
                        Emit the current tag token. Switch to the data state. */
1079
                        $this->emitToken($this->token);
1080
                        $state = 'data';
1081
1082
                    } elseif ($char === false) {
1083
                        /* EOF
1084
                        Parse error. Reconsume the EOF character in the data state. */
1085
                        $this->emitToken(array(
1086
                            'type' => self::PARSEERROR,
1087
                            'data' => 'unexpected-EOF-after-attribute-value'
1088
                        ));
1089
                        $this->stream->unget();
1090
                        $state = 'data';
1091
1092
                    } else {
1093
                        /* Anything else
1094
                        Parse error. Reconsume the character in the before attribute
1095
                        name state. */
1096
                        $this->emitToken(array(
1097
                            'type' => self::PARSEERROR,
1098
                            'data' => 'unexpected-character-after-attribute-value'
1099
                        ));
1100
                        $this->stream->unget();
1101
                        $state = 'before attribute name';
1102
                    }
1103
                break;
1104
1105
                case 'self-closing start tag':
1106
                    /* Consume the next input character: */
1107
                    $char = $this->stream->char();
1108
1109
                    if ($char === '>') {
1110
                        /* U+003E GREATER-THAN SIGN (>)
1111
                        Set the self-closing flag of the current tag token.
1112
                        Emit the current tag token. Switch to the data state. */
1113
                        // not sure if this is the name we want
1114
                        $this->token['self-closing'] = true;
1115
                        $this->emitToken($this->token);
1116
                        $state = 'data';
1117
1118
                    } elseif ($char === false) {
1119
                        /* EOF
1120
                        Parse error. Reconsume the EOF character in the data state. */
1121
                        $this->emitToken(array(
1122
                            'type' => self::PARSEERROR,
1123
                            'data' => 'unexpected-eof-after-self-closing'
1124
                        ));
1125
                        $this->stream->unget();
1126
                        $state = 'data';
1127
1128
                    } else {
1129
                        /* Anything else
1130
                        Parse error. Reconsume the character in the before attribute name state. */
1131
                        $this->emitToken(array(
1132
                            'type' => self::PARSEERROR,
1133
                            'data' => 'unexpected-character-after-self-closing'
1134
                        ));
1135
                        $this->stream->unget();
1136
                        $state = 'before attribute name';
1137
                    }
1138
                break;
1139
1140
                case 'bogus comment':
1141
                    /* (This can only happen if the content model flag is set to the PCDATA state.) */
1142
                    /* Consume every character up to the first U+003E GREATER-THAN SIGN
1143
                    character (>) or the end of the file (EOF), whichever comes first. Emit
1144
                    a comment token whose data is the concatenation of all the characters
1145
                    starting from and including the character that caused the state machine
1146
                    to switch into the bogus comment state, up to and including the last
1147
                    consumed character before the U+003E character, if any, or up to the
1148
                    end of the file otherwise. (If the comment was started by the end of
1149
                    the file (EOF), the token is empty.) */
1150
                    $this->token['data'] .= (string) $this->stream->charsUntil('>');
1151
                    $this->stream->char();
1152
1153
                    $this->emitToken($this->token);
1154
1155
                    /* Switch to the data state. */
1156
                    $state = 'data';
1157
                break;
1158
1159
                case 'markup declaration open':
1160
                    // Consume for below
1161
                    $hyphens = $this->stream->charsWhile('-', 2);
1162
                    if ($hyphens === '-') {
1163
                        $this->stream->unget();
1164
                    }
1165
                    if ($hyphens !== '--') {
1166
                        $alpha = $this->stream->charsWhile(self::ALPHA, 7);
1167
                    }
1168
1169
                    /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1170
                    characters, consume those two characters, create a comment token whose
1171
                    data is the empty string, and switch to the comment state. */
1172
                    if($hyphens === '--') {
1173
                        $state = 'comment start';
1174
                        $this->token = array(
1175
                            'data' => '',
1176
                            'type' => self::COMMENT
1177
                        );
1178
1179
                    /* Otherwise if the next seven characters are a case-insensitive match
1180
                    for the word "DOCTYPE", then consume those characters and switch to the
1181
                    DOCTYPE state. */
1182
                    } elseif(strtoupper($alpha) === 'DOCTYPE') {
1183
                        $state = 'DOCTYPE';
1184
1185
                    // XXX not implemented
1186
                    /* Otherwise, if the insertion mode is "in foreign content"
1187
                    and the current node is not an element in the HTML namespace
1188
                    and the next seven characters are an ASCII case-sensitive
1189
                    match for the string "[CDATA[" (the five uppercase letters
1190
                    "CDATA" with a U+005B LEFT SQUARE BRACKET character before
1191
                    and after), then consume those characters and switch to the
1192
                    CDATA section state (which is unrelated to the content model
1193
                    flag's CDATA state). */
1194
1195
                    /* Otherwise, this is a parse error. Switch to the bogus comment state.
1196
                    The next character that is consumed, if any, is the first character
1197
                    that will be in the comment. */
1198
                    } else {
1199
                        $this->emitToken(array(
1200
                            'type' => self::PARSEERROR,
1201
                            'data' => 'expected-dashes-or-doctype'
1202
                        ));
1203
                        $this->token = array(
1204
                            'data' => (string) $alpha,
1205
                            'type' => self::COMMENT
1206
                        );
1207
                        $state = 'bogus comment';
1208
                    }
1209
                break;
1210
1211 View Code Duplication
                case 'comment start':
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
1212
                    /* Consume the next input character: */
1213
                    $char = $this->stream->char();
1214
1215
                    if ($char === '-') {
1216
                        /* U+002D HYPHEN-MINUS (-)
1217
                        Switch to the comment start dash state. */
1218
                        $state = 'comment start dash';
1219
                    } elseif ($char === '>') {
1220
                        /* U+003E GREATER-THAN SIGN (>)
1221
                        Parse error. Emit the comment token. Switch to the
1222
                        data state. */
1223
                        $this->emitToken(array(
1224
                            'type' => self::PARSEERROR,
1225
                            'data' => 'incorrect-comment'
1226
                        ));
1227
                        $this->emitToken($this->token);
1228
                        $state = 'data';
1229
                    } elseif ($char === false) {
1230
                        /* EOF
1231
                        Parse error. Emit the comment token. Reconsume the
1232
                        EOF character in the data state. */
1233
                        $this->emitToken(array(
1234
                            'type' => self::PARSEERROR,
1235
                            'data' => 'eof-in-comment'
1236
                        ));
1237
                        $this->emitToken($this->token);
1238
                        $this->stream->unget();
1239
                        $state = 'data';
1240
                    } else {
1241
                        /* Anything else
1242
                        Append the input character to the comment token's
1243
                        data. Switch to the comment state. */
1244
                        $this->token['data'] .= $char;
1245
                        $state = 'comment';
1246
                    }
1247
                break;
1248
1249 View Code Duplication
                case 'comment start dash':
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
1250
                    /* Consume the next input character: */
1251
                    $char = $this->stream->char();
1252
                    if ($char === '-') {
1253
                        /* U+002D HYPHEN-MINUS (-)
1254
                        Switch to the comment end state */
1255
                        $state = 'comment end';
1256
                    } elseif ($char === '>') {
1257
                        /* U+003E GREATER-THAN SIGN (>)
1258
                        Parse error. Emit the comment token. Switch to the
1259
                        data state. */
1260
                        $this->emitToken(array(
1261
                            'type' => self::PARSEERROR,
1262
                            'data' => 'incorrect-comment'
1263
                        ));
1264
                        $this->emitToken($this->token);
1265
                        $state = 'data';
1266
                    } elseif ($char === false) {
1267
                        /* Parse error. Emit the comment token. Reconsume the
1268
                        EOF character in the data state. */
1269
                        $this->emitToken(array(
1270
                            'type' => self::PARSEERROR,
1271
                            'data' => 'eof-in-comment'
1272
                        ));
1273
                        $this->emitToken($this->token);
1274
                        $this->stream->unget();
1275
                        $state = 'data';
1276
                    } else {
1277
                        $this->token['data'] .= '-' . $char;
1278
                        $state = 'comment';
1279
                    }
1280
                break;
1281
1282 View Code Duplication
                case 'comment':
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
1283
                    /* Consume the next input character: */
1284
                    $char = $this->stream->char();
1285
1286
                    if($char === '-') {
1287
                        /* U+002D HYPHEN-MINUS (-)
1288
                        Switch to the comment end dash state */
1289
                        $state = 'comment end dash';
1290
1291
                    } elseif($char === false) {
1292
                        /* EOF
1293
                        Parse error. Emit the comment token. Reconsume the EOF character
1294
                        in the data state. */
1295
                        $this->emitToken(array(
1296
                            'type' => self::PARSEERROR,
1297
                            'data' => 'eof-in-comment'
1298
                        ));
1299
                        $this->emitToken($this->token);
1300
                        $this->stream->unget();
1301
                        $state = 'data';
1302
1303
                    } else {
1304
                        /* Anything else
1305
                        Append the input character to the comment token's data. Stay in
1306
                        the comment state. */
1307
                        $chars = $this->stream->charsUntil('-');
1308
1309
                        $this->token['data'] .= $char . $chars;
1310
                    }
1311
                break;
1312
1313 View Code Duplication
                case 'comment end dash':
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
1314
                    /* Consume the next input character: */
1315
                    $char = $this->stream->char();
1316
1317
                    if($char === '-') {
1318
                        /* U+002D HYPHEN-MINUS (-)
1319
                        Switch to the comment end state  */
1320
                        $state = 'comment end';
1321
1322
                    } elseif($char === false) {
1323
                        /* EOF
1324
                        Parse error. Emit the comment token. Reconsume the EOF character
1325
                        in the data state. */
1326
                        $this->emitToken(array(
1327
                            'type' => self::PARSEERROR,
1328
                            'data' => 'eof-in-comment-end-dash'
1329
                        ));
1330
                        $this->emitToken($this->token);
1331
                        $this->stream->unget();
1332
                        $state = 'data';
1333
1334
                    } else {
1335
                        /* Anything else
1336
                        Append a U+002D HYPHEN-MINUS (-) character and the input
1337
                        character to the comment token's data. Switch to the comment state. */
1338
                        $this->token['data'] .= '-'.$char;
1339
                        $state = 'comment';
1340
                    }
1341
                break;
1342
1343
                case 'comment end':
                    // Entered once "--" has been seen inside a comment. The next
                    // character decides whether the comment closes here or the
                    // dashes turn out to be part of the comment's data.

                    /* Consume the next input character: */
                    $char = $this->stream->char();

                    if($char === '>') {
                        /* U+003E GREATER-THAN SIGN (>)
                        Emit the comment token. Switch to the data state. */
                        $this->emitToken($this->token);
                        $state = 'data';

                    } elseif($char === '-') {
                        /* U+002D HYPHEN-MINUS (-)
                        Parse error. Append a U+002D HYPHEN-MINUS (-) character
                        to the comment token's data. Stay in the comment end
                        state. */
                        $this->emitToken(array(
                            'type' => self::PARSEERROR,
                            'data' => 'unexpected-dash-after-double-dash-in-comment'
                        ));
                        $this->token['data'] .= '-';

                    } elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
                        // Tab, line feed, form feed or space after "--".
                        // The form feed must be spelled "\x0c": "\x0a" is the
                        // same byte as "\n", so testing it again would let a
                        // form feed slip through to the "anything else" branch.
                        // Report the error, keep the two dashes and the
                        // whitespace character as comment data, and continue
                        // in the comment end space state.
                        $this->emitToken(array(
                            'type' => self::PARSEERROR,
                            'data' => 'unexpected-space-after-double-dash-in-comment'
                        ));
                        $this->token['data'] .= '--' . $char;
                        $state = 'comment end space';

                    } elseif($char === '!') {
                        // "--!" seen: report the error and let the comment end
                        // bang state decide what gets appended to the data.
                        $this->emitToken(array(
                            'type' => self::PARSEERROR,
                            'data' => 'unexpected-bang-after-double-dash-in-comment'
                        ));
                        $state = 'comment end bang';

                    } elseif($char === false) {
                        /* EOF
                        Parse error. Emit the comment token. Reconsume the
                        EOF character in the data state. */
                        $this->emitToken(array(
                            'type' => self::PARSEERROR,
                            'data' => 'eof-in-comment-double-dash'
                        ));
                        $this->emitToken($this->token);
                        $this->stream->unget();
                        $state = 'data';

                    } else {
                        /* Anything else
                        Parse error. Append two U+002D HYPHEN-MINUS (-)
                        characters and the input character to the comment token's
                        data. Switch to the comment state. */
                        $this->emitToken(array(
                            'type' => self::PARSEERROR,
                            'data' => 'unexpected-char-in-comment'
                        ));
                        $this->token['data'] .= '--'.$char;
                        $state = 'comment';
                    }
                break;
1404
1405
                case 'comment end bang':
1406
                    $char = $this->stream->char();
1407
                    if ($char === '>') {
1408
                        $this->emitToken($this->token);
1409
                        $state = 'data';
1410
                    } elseif ($char === "-") {
1411
                        $this->token['data'] .= '--!';
1412
                        $state = 'comment end dash';
1413
                    } elseif ($char === false) {
1414
                        $this->emitToken(array(
1415
                            'type' => self::PARSEERROR,
1416
                            'data' => 'eof-in-comment-end-bang'
1417
                        ));
1418
                        $this->emitToken($this->token);
1419
                        $this->stream->unget();
1420
                        $state = 'data';
1421
                    } else {
1422
                        $this->token['data'] .= '--!' . $char;
1423
                        $state = 'comment';
1424
                    }
1425
                break;
1426
1427
                case 'comment end space':
1428
                    $char = $this->stream->char();
1429
                    if ($char === '>') {
1430
                        $this->emitToken($this->token);
1431
                        $state = 'data';
1432
                    } elseif ($char === '-') {
1433
                        $state = 'comment end dash';
1434
                    } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1435
                        $this->token['data'] .= $char;
1436
                    } elseif ($char === false) {
1437
                        $this->emitToken(array(
1438
                            'type' => self::PARSEERROR,
1439
                            'data' => 'unexpected-eof-in-comment-end-space',
1440
                        ));
1441
                        $this->emitToken($this->token);
1442
                        $this->stream->unget();
1443
                        $state = 'data';
1444
                    } else {
1445
                        $this->token['data'] .= $char;
1446
                        $state = 'comment';
1447
                    }
1448
                break;
1449
1450
                case 'DOCTYPE':
1451
                    /* Consume the next input character: */
1452
                    $char = $this->stream->char();
1453
1454
                    if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1455
                        /* U+0009 CHARACTER TABULATION
1456
                           U+000A LINE FEED (LF)
1457
                           U+000C FORM FEED (FF)
1458
                           U+0020 SPACE
1459
                        Switch to the before DOCTYPE name state. */
1460
                        $state = 'before DOCTYPE name';
1461
                    
1462 View Code Duplication
                    } elseif($char === false) {
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
1463
                        /* EOF
1464
                        Parse error. Create a new DOCTYPE token. Set its
1465
                        force-quirks flag to on. Emit the token. Reconsume the
1466
                        EOF character in the data state. */
1467
                        $this->emitToken(array(
1468
                            'type' => self::PARSEERROR,
1469
                            'data' => 'need-space-after-doctype-but-got-eof'
1470
                        ));
1471
                        $this->emitToken(array(
1472
                            'name' => '',
1473
                            'type' => self::DOCTYPE,
1474
                            'force-quirks' => true,
1475
                            'error' => true
1476
                        ));
1477
                        $this->stream->unget();
1478
                        $state = 'data';
1479
1480
                    } else {
1481
                        /* Anything else
1482
                        Parse error. Reconsume the current character in the
1483
                        before DOCTYPE name state. */
1484
                        $this->emitToken(array(
1485
                            'type' => self::PARSEERROR,
1486
                            'data' => 'need-space-after-doctype'
1487
                        ));
1488
                        $this->stream->unget();
1489
                        $state = 'before DOCTYPE name';
1490
                    }
1491
                break;
1492
1493
                case 'before DOCTYPE name':
1494
                    /* Consume the next input character: */
1495
                    $char = $this->stream->char();
1496
1497
                    if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1498
                        /* U+0009 CHARACTER TABULATION
1499
                           U+000A LINE FEED (LF)
1500
                           U+000C FORM FEED (FF)
1501
                           U+0020 SPACE
1502
                        Stay in the before DOCTYPE name state. */
1503
1504 View Code Duplication
                    } elseif($char === '>') {
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
1505
                        /* U+003E GREATER-THAN SIGN (>)
1506
                        Parse error. Create a new DOCTYPE token. Set its
1507
                        force-quirks flag to on. Emit the token. Switch to the
1508
                        data state. */
1509
                        $this->emitToken(array(
1510
                            'type' => self::PARSEERROR,
1511
                            'data' => 'expected-doctype-name-but-got-right-bracket'
1512
                        ));
1513
                        $this->emitToken(array(
1514
                            'name' => '',
1515
                            'type' => self::DOCTYPE,
1516
                            'force-quirks' => true,
1517
                            'error' => true
1518
                        ));
1519
1520
                        $state = 'data';
1521
1522
                    } elseif('A' <= $char && $char <= 'Z') {
1523
                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1524
                        Create a new DOCTYPE token. Set the token's name to the
1525
                        lowercase version of the input character (add 0x0020 to
1526
                        the character's code point). Switch to the DOCTYPE name
1527
                        state. */
1528
                        $this->token = array(
1529
                            'name' => strtolower($char),
1530
                            'type' => self::DOCTYPE,
1531
                            'error' => true
1532
                        );
1533
1534
                        $state = 'DOCTYPE name';
1535
1536 View Code Duplication
                    } elseif($char === false) {
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
1537
                        /* EOF
1538
                        Parse error. Create a new DOCTYPE token. Set its
1539
                        force-quirks flag to on. Emit the token. Reconsume the
1540
                        EOF character in the data state. */
1541
                        $this->emitToken(array(
1542
                            'type' => self::PARSEERROR,
1543
                            'data' => 'expected-doctype-name-but-got-eof'
1544
                        ));
1545
                        $this->emitToken(array(
1546
                            'name' => '',
1547
                            'type' => self::DOCTYPE,
1548
                            'force-quirks' => true,
1549
                            'error' => true
1550
                        ));
1551
1552
                        $this->stream->unget();
1553
                        $state = 'data';
1554
1555
                    } else {
1556
                        /* Anything else
1557
                        Create a new DOCTYPE token. Set the token's name to the
1558
                        current input character. Switch to the DOCTYPE name state. */
1559
                        $this->token = array(
1560
                            'name' => $char,
1561
                            'type' => self::DOCTYPE,
1562
                            'error' => true
1563
                        );
1564
1565
                        $state = 'DOCTYPE name';
1566
                    }
1567
                break;
1568
1569
                case 'DOCTYPE name':
1570
                    /* Consume the next input character: */
1571
                    $char = $this->stream->char();
1572
1573
                    if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1574
                        /* U+0009 CHARACTER TABULATION
1575
                           U+000A LINE FEED (LF)
1576
                           U+000C FORM FEED (FF)
1577
                           U+0020 SPACE
1578
                        Switch to the after DOCTYPE name state. */
1579
                        $state = 'after DOCTYPE name';
1580
1581
                    } elseif($char === '>') {
1582
                        /* U+003E GREATER-THAN SIGN (>)
1583
                        Emit the current DOCTYPE token. Switch to the data state. */
1584
                        $this->emitToken($this->token);
1585
                        $state = 'data';
1586
1587
                    } elseif('A' <= $char && $char <= 'Z') {
1588
                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1589
                        Append the lowercase version of the input character
1590
                        (add 0x0020 to the character's code point) to the current
1591
                        DOCTYPE token's name. Stay in the DOCTYPE name state. */
1592
                        $this->token['name'] .= strtolower($char);
1593
1594
                    } elseif($char === false) {
1595
                        /* EOF
1596
                        Parse error. Set the DOCTYPE token's force-quirks flag
1597
                        to on. Emit that DOCTYPE token. Reconsume the EOF
1598
                        character in the data state. */
1599
                        $this->emitToken(array(
1600
                            'type' => self::PARSEERROR,
1601
                            'data' => 'eof-in-doctype-name'
1602
                        ));
1603
                        $this->token['force-quirks'] = true;
1604
                        $this->emitToken($this->token);
1605
                        $this->stream->unget();
1606
                        $state = 'data';
1607
1608
                    } else {
1609
                        /* Anything else
1610
                        Append the current input character to the current
1611
                        DOCTYPE token's name. Stay in the DOCTYPE name state. */
1612
                        $this->token['name'] .= $char;
1613
                    }
1614
1615
                    // XXX this is probably some sort of quirks mode designation,
1616
                    // check tree-builder to be sure. In general 'error' needs
1617
                    // to be specc'ified, this probably means removing it at the end
1618
                    $this->token['error'] = ($this->token['name'] === 'HTML')
1619
                        ? false
1620
                        : true;
1621
                break;
1622
1623
                case 'after DOCTYPE name':
1624
                    /* Consume the next input character: */
1625
                    $char = $this->stream->char();
1626
1627
                    if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1628
                        /* U+0009 CHARACTER TABULATION
1629
                           U+000A LINE FEED (LF)
1630
                           U+000C FORM FEED (FF)
1631
                           U+0020 SPACE
1632
                        Stay in the after DOCTYPE name state. */
1633
1634
                    } elseif($char === '>') {
1635
                        /* U+003E GREATER-THAN SIGN (>)
1636
                        Emit the current DOCTYPE token. Switch to the data state. */
1637
                        $this->emitToken($this->token);
1638
                        $state = 'data';
1639
1640
                    } elseif($char === false) {
1641
                        /* EOF
1642
                        Parse error. Set the DOCTYPE token's force-quirks flag
1643
                        to on. Emit that DOCTYPE token. Reconsume the EOF
1644
                        character in the data state. */
1645
                        $this->emitToken(array(
1646
                            'type' => self::PARSEERROR,
1647
                            'data' => 'eof-in-doctype'
1648
                        ));
1649
                        $this->token['force-quirks'] = true;
1650
                        $this->emitToken($this->token);
1651
                        $this->stream->unget();
1652
                        $state = 'data';
1653
1654
                    } else {
1655
                        /* Anything else */
1656
1657
                        $nextSix = strtoupper($char . $this->stream->charsWhile(self::ALPHA, 5));
1658
                        if ($nextSix === 'PUBLIC') {
1659
                            /* If the next six characters are an ASCII
1660
                            case-insensitive match for the word "PUBLIC", then
1661
                            consume those characters and switch to the before
1662
                            DOCTYPE public identifier state. */
1663
                            $state = 'before DOCTYPE public identifier';
1664
1665
                        } elseif ($nextSix === 'SYSTEM') {
1666
                            /* Otherwise, if the next six characters are an ASCII
1667
                            case-insensitive match for the word "SYSTEM", then
1668
                            consume those characters and switch to the before
1669
                            DOCTYPE system identifier state. */
1670
                            $state = 'before DOCTYPE system identifier';
1671
1672
                        } else {
1673
                            /* Otherwise, this is a parse error. Set the DOCTYPE
1674
                            token's force-quirks flag to on. Switch to the bogus
1675
                            DOCTYPE state. */
1676
                            $this->emitToken(array(
1677
                                'type' => self::PARSEERROR,
1678
                                'data' => 'expected-space-or-right-bracket-in-doctype'
1679
                            ));
1680
                            $this->token['force-quirks'] = true;
1681
                            $this->token['error'] = true;
1682
                            $state = 'bogus DOCTYPE';
1683
                        }
1684
                    }
1685
                break;
1686
1687 View Code Duplication
                case 'before DOCTYPE public identifier':
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
1688
                    /* Consume the next input character: */
1689
                    $char = $this->stream->char();
1690
1691
                    if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1692
                        /* U+0009 CHARACTER TABULATION
1693
                           U+000A LINE FEED (LF)
1694
                           U+000C FORM FEED (FF)
1695
                           U+0020 SPACE
1696
                        Stay in the before DOCTYPE public identifier state. */
1697
                    } elseif ($char === '"') {
1698
                        /* U+0022 QUOTATION MARK (")
1699
                        Set the DOCTYPE token's public identifier to the empty
1700
                        string (not missing), then switch to the DOCTYPE public
1701
                        identifier (double-quoted) state. */
1702
                        $this->token['public'] = '';
1703
                        $state = 'DOCTYPE public identifier (double-quoted)';
1704
                    } elseif ($char === "'") {
1705
                        /* U+0027 APOSTROPHE (')
1706
                        Set the DOCTYPE token's public identifier to the empty
1707
                        string (not missing), then switch to the DOCTYPE public
1708
                        identifier (single-quoted) state. */
1709
                        $this->token['public'] = '';
1710
                        $state = 'DOCTYPE public identifier (single-quoted)';
1711
                    } elseif ($char === '>') {
1712
                        /* Parse error. Set the DOCTYPE token's force-quirks flag
1713
                        to on. Emit that DOCTYPE token. Switch to the data state. */
1714
                        $this->emitToken(array(
1715
                            'type' => self::PARSEERROR,
1716
                            'data' => 'unexpected-end-of-doctype'
1717
                        ));
1718
                        $this->token['force-quirks'] = true;
1719
                        $this->emitToken($this->token);
1720
                        $state = 'data';
1721
                    } elseif ($char === false) {
1722
                        /* Parse error. Set the DOCTYPE token's force-quirks
1723
                        flag to on. Emit that DOCTYPE token. Reconsume the EOF
1724
                        character in the data state. */
1725
                        $this->emitToken(array(
1726
                            'type' => self::PARSEERROR,
1727
                            'data' => 'eof-in-doctype'
1728
                        ));
1729
                        $this->token['force-quirks'] = true;
1730
                        $this->emitToken($this->token);
1731
                        $this->stream->unget();
1732
                        $state = 'data';
1733
                    } else {
1734
                        /* Parse error. Set the DOCTYPE token's force-quirks flag
1735
                        to on. Switch to the bogus DOCTYPE state. */
1736
                        $this->emitToken(array(
1737
                            'type' => self::PARSEERROR,
1738
                            'data' => 'unexpected-char-in-doctype'
1739
                        ));
1740
                        $this->token['force-quirks'] = true;
1741
                        $state = 'bogus DOCTYPE';
1742
                    }
1743
                break;
1744
1745 View Code Duplication
                case 'DOCTYPE public identifier (double-quoted)':
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
1746
                    /* Consume the next input character: */
1747
                    $char = $this->stream->char();
1748
1749
                    if ($char === '"') {
1750
                        /* U+0022 QUOTATION MARK (")
1751
                        Switch to the after DOCTYPE public identifier state. */
1752
                        $state = 'after DOCTYPE public identifier';
1753
                    } elseif ($char === '>') {
1754
                        /* U+003E GREATER-THAN SIGN (>)
1755
                        Parse error. Set the DOCTYPE token's force-quirks flag
1756
                        to on. Emit that DOCTYPE token. Switch to the data state. */
1757
                        $this->emitToken(array(
1758
                            'type' => self::PARSEERROR,
1759
                            'data' => 'unexpected-end-of-doctype'
1760
                        ));
1761
                        $this->token['force-quirks'] = true;
1762
                        $this->emitToken($this->token);
1763
                        $state = 'data';
1764
                    } elseif ($char === false) {
1765
                        /* EOF
1766
                        Parse error. Set the DOCTYPE token's force-quirks flag
1767
                        to on. Emit that DOCTYPE token. Reconsume the EOF
1768
                        character in the data state. */
1769
                        $this->emitToken(array(
1770
                            'type' => self::PARSEERROR,
1771
                            'data' => 'eof-in-doctype'
1772
                        ));
1773
                        $this->token['force-quirks'] = true;
1774
                        $this->emitToken($this->token);
1775
                        $this->stream->unget();
1776
                        $state = 'data';
1777
                    } else {
1778
                        /* Anything else
1779
                        Append the current input character to the current
1780
                        DOCTYPE token's public identifier. Stay in the DOCTYPE
1781
                        public identifier (double-quoted) state. */
1782
                        $this->token['public'] .= $char;
1783
                    }
1784
                break;
1785
1786 View Code Duplication
                case 'DOCTYPE public identifier (single-quoted)':
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
1787
                    /* Consume the next input character: */
1788
                    $char = $this->stream->char();
1789
1790
                    if ($char === "'") {
1791
                        /* U+0027 APOSTROPHE (')
1792
                        Switch to the after DOCTYPE public identifier state. */
1793
                        $state = 'after DOCTYPE public identifier';
1794
                    } elseif ($char === '>') {
1795
                        /* U+003E GREATER-THAN SIGN (>)
1796
                        Parse error. Set the DOCTYPE token's force-quirks flag
1797
                        to on. Emit that DOCTYPE token. Switch to the data state. */
1798
                        $this->emitToken(array(
1799
                            'type' => self::PARSEERROR,
1800
                            'data' => 'unexpected-end-of-doctype'
1801
                        ));
1802
                        $this->token['force-quirks'] = true;
1803
                        $this->emitToken($this->token);
1804
                        $state = 'data';
1805
                    } elseif ($char === false) {
1806
                        /* EOF
1807
                        Parse error. Set the DOCTYPE token's force-quirks flag
1808
                        to on. Emit that DOCTYPE token. Reconsume the EOF
1809
                        character in the data state. */
1810
                        $this->emitToken(array(
1811
                            'type' => self::PARSEERROR,
1812
                            'data' => 'eof-in-doctype'
1813
                        ));
1814
                        $this->token['force-quirks'] = true;
1815
                        $this->emitToken($this->token);
1816
                        $this->stream->unget();
1817
                        $state = 'data';
1818
                    } else {
1819
                        /* Anything else
1820
                        Append the current input character to the current
1821
                        DOCTYPE token's public identifier. Stay in the DOCTYPE
1822
                        public identifier (single-quoted) state. */
1823
                        $this->token['public'] .= $char;
1824
                    }
1825
                break;
1826
1827
                case 'after DOCTYPE public identifier':
1828
                    /* Consume the next input character: */
1829
                    $char = $this->stream->char();
1830
1831
                    if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1832
                        /* U+0009 CHARACTER TABULATION
1833
                           U+000A LINE FEED (LF)
1834
                           U+000C FORM FEED (FF)
1835
                           U+0020 SPACE
1836
                        Stay in the after DOCTYPE public identifier state. */
1837
                    } elseif ($char === '"') {
1838
                        /* U+0022 QUOTATION MARK (")
1839
                        Set the DOCTYPE token's system identifier to the
1840
                        empty string (not missing), then switch to the DOCTYPE
1841
                        system identifier (double-quoted) state. */
1842
                        $this->token['system'] = '';
1843
                        $state = 'DOCTYPE system identifier (double-quoted)';
1844
                    } elseif ($char === "'") {
1845
                        /* U+0027 APOSTROPHE (')
1846
                        Set the DOCTYPE token's system identifier to the
1847
                        empty string (not missing), then switch to the DOCTYPE
1848
                        system identifier (single-quoted) state. */
1849
                        $this->token['system'] = '';
1850
                        $state = 'DOCTYPE system identifier (single-quoted)';
1851
                    } elseif ($char === '>') {
1852
                        /* U+003E GREATER-THAN SIGN (>)
1853
                        Emit the current DOCTYPE token. Switch to the data state. */
1854
                        $this->emitToken($this->token);
1855
                        $state = 'data';
1856
                    } elseif ($char === false) {
1857
                        /* Parse error. Set the DOCTYPE token's force-quirks
1858
                        flag to on. Emit that DOCTYPE token. Reconsume the EOF
1859
                        character in the data state. */
1860
                        $this->emitToken(array(
1861
                            'type' => self::PARSEERROR,
1862
                            'data' => 'eof-in-doctype'
1863
                        ));
1864
                        $this->token['force-quirks'] = true;
1865
                        $this->emitToken($this->token);
1866
                        $this->stream->unget();
1867
                        $state = 'data';
1868
                    } else {
1869
                        /* Anything else
1870
                        Parse error. Set the DOCTYPE token's force-quirks flag
1871
                        to on. Switch to the bogus DOCTYPE state. */
1872
                        $this->emitToken(array(
1873
                            'type' => self::PARSEERROR,
1874
                            'data' => 'unexpected-char-in-doctype'
1875
                        ));
1876
                        $this->token['force-quirks'] = true;
1877
                        $state = 'bogus DOCTYPE';
1878
                    }
1879
                break;
1880
1881 View Code Duplication
                case 'before DOCTYPE system identifier':
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
1882
                    /* Consume the next input character: */
1883
                    $char = $this->stream->char();
1884
1885
                    if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1886
                        /* U+0009 CHARACTER TABULATION
1887
                           U+000A LINE FEED (LF)
1888
                           U+000C FORM FEED (FF)
1889
                           U+0020 SPACE
1890
                        Stay in the before DOCTYPE system identifier state. */
1891
                    } elseif ($char === '"') {
1892
                        /* U+0022 QUOTATION MARK (")
1893
                        Set the DOCTYPE token's system identifier to the empty
1894
                        string (not missing), then switch to the DOCTYPE system
1895
                        identifier (double-quoted) state. */
1896
                        $this->token['system'] = '';
1897
                        $state = 'DOCTYPE system identifier (double-quoted)';
1898
                    } elseif ($char === "'") {
1899
                        /* U+0027 APOSTROPHE (')
1900
                        Set the DOCTYPE token's system identifier to the empty
1901
                        string (not missing), then switch to the DOCTYPE system
1902
                        identifier (single-quoted) state. */
1903
                        $this->token['system'] = '';
1904
                        $state = 'DOCTYPE system identifier (single-quoted)';
1905
                    } elseif ($char === '>') {
1906
                        /* Parse error. Set the DOCTYPE token's force-quirks flag
1907
                        to on. Emit that DOCTYPE token. Switch to the data state. */
1908
                        $this->emitToken(array(
1909
                            'type' => self::PARSEERROR,
1910
                            'data' => 'unexpected-char-in-doctype'
1911
                        ));
1912
                        $this->token['force-quirks'] = true;
1913
                        $this->emitToken($this->token);
1914
                        $state = 'data';
1915
                    } elseif ($char === false) {
1916
                        /* Parse error. Set the DOCTYPE token's force-quirks
1917
                        flag to on. Emit that DOCTYPE token. Reconsume the EOF
1918
                        character in the data state. */
1919
                        $this->emitToken(array(
1920
                            'type' => self::PARSEERROR,
1921
                            'data' => 'eof-in-doctype'
1922
                        ));
1923
                        $this->token['force-quirks'] = true;
1924
                        $this->emitToken($this->token);
1925
                        $this->stream->unget();
1926
                        $state = 'data';
1927
                    } else {
1928
                        /* Parse error. Set the DOCTYPE token's force-quirks flag
1929
                        to on. Switch to the bogus DOCTYPE state. */
1930
                        $this->emitToken(array(
1931
                            'type' => self::PARSEERROR,
1932
                            'data' => 'unexpected-char-in-doctype'
1933
                        ));
1934
                        $this->token['force-quirks'] = true;
1935
                        $state = 'bogus DOCTYPE';
1936
                    }
1937
                break;
1938
1939 View Code Duplication
                case 'DOCTYPE system identifier (double-quoted)':
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
1940
                    /* Consume the next input character: */
1941
                    $char = $this->stream->char();
1942
1943
                    if ($char === '"') {
1944
                        /* U+0022 QUOTATION MARK (")
1945
                        Switch to the after DOCTYPE system identifier state. */
1946
                        $state = 'after DOCTYPE system identifier';
1947
                    } elseif ($char === '>') {
1948
                        /* U+003E GREATER-THAN SIGN (>)
1949
                        Parse error. Set the DOCTYPE token's force-quirks flag
1950
                        to on. Emit that DOCTYPE token. Switch to the data state. */
1951
                        $this->emitToken(array(
1952
                            'type' => self::PARSEERROR,
1953
                            'data' => 'unexpected-end-of-doctype'
1954
                        ));
1955
                        $this->token['force-quirks'] = true;
1956
                        $this->emitToken($this->token);
1957
                        $state = 'data';
1958
                    } elseif ($char === false) {
1959
                        /* EOF
1960
                        Parse error. Set the DOCTYPE token's force-quirks flag
1961
                        to on. Emit that DOCTYPE token. Reconsume the EOF
1962
                        character in the data state. */
1963
                        $this->emitToken(array(
1964
                            'type' => self::PARSEERROR,
1965
                            'data' => 'eof-in-doctype'
1966
                        ));
1967
                        $this->token['force-quirks'] = true;
1968
                        $this->emitToken($this->token);
1969
                        $this->stream->unget();
1970
                        $state = 'data';
1971
                    } else {
1972
                        /* Anything else
1973
                        Append the current input character to the current
1974
                        DOCTYPE token's system identifier. Stay in the DOCTYPE
1975
                        system identifier (double-quoted) state. */
1976
                        $this->token['system'] .= $char;
1977
                    }
1978
                break;
1979
1980 View Code Duplication
                case 'DOCTYPE system identifier (single-quoted)':
0 ignored issues
show
This code seems to be duplicated across your project.

Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.

You can also find more detailed suggestions in the “Code” section of your repository.

Loading history...
1981
                    /* Consume the next input character: */
1982
                    $char = $this->stream->char();
1983
1984
                    if ($char === "'") {
1985
                        /* U+0027 APOSTROPHE (')
1986
                        Switch to the after DOCTYPE system identifier state. */
1987
                        $state = 'after DOCTYPE system identifier';
1988
                    } elseif ($char === '>') {
1989
                        /* U+003E GREATER-THAN SIGN (>)
1990
                        Parse error. Set the DOCTYPE token's force-quirks flag
1991
                        to on. Emit that DOCTYPE token. Switch to the data state. */
1992
                        $this->emitToken(array(
1993
                            'type' => self::PARSEERROR,
1994
                            'data' => 'unexpected-end-of-doctype'
1995
                        ));
1996
                        $this->token['force-quirks'] = true;
1997
                        $this->emitToken($this->token);
1998
                        $state = 'data';
1999
                    } elseif ($char === false) {
2000
                        /* EOF
2001
                        Parse error. Set the DOCTYPE token's force-quirks flag
2002
                        to on. Emit that DOCTYPE token. Reconsume the EOF
2003
                        character in the data state. */
2004
                        $this->emitToken(array(
2005
                            'type' => self::PARSEERROR,
2006
                            'data' => 'eof-in-doctype'
2007
                        ));
2008
                        $this->token['force-quirks'] = true;
2009
                        $this->emitToken($this->token);
2010
                        $this->stream->unget();
2011
                        $state = 'data';
2012
                    } else {
2013
                        /* Anything else
2014
                        Append the current input character to the current
2015
                        DOCTYPE token's system identifier. Stay in the DOCTYPE
2016
                        system identifier (single-quoted) state. */
2017
                        $this->token['system'] .= $char;
2018
                    }
2019
                break;
2020
2021
                case 'after DOCTYPE system identifier':
2022
                    /* Consume the next input character: */
2023
                    $char = $this->stream->char();
2024
2025
                    if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
2026
                        /* U+0009 CHARACTER TABULATION
2027
                           U+000A LINE FEED (LF)
2028
                           U+000C FORM FEED (FF)
2029
                           U+0020 SPACE
2030
                        Stay in the after DOCTYPE system identifier state. */
2031
                    } elseif ($char === '>') {
2032
                        /* U+003E GREATER-THAN SIGN (>)
2033
                        Emit the current DOCTYPE token. Switch to the data state. */
2034
                        $this->emitToken($this->token);
2035
                        $state = 'data';
2036
                    } elseif ($char === false) {
2037
                        /* Parse error. Set the DOCTYPE token's force-quirks
2038
                        flag to on. Emit that DOCTYPE token. Reconsume the EOF
2039
                        character in the data state. */
2040
                        $this->emitToken(array(
2041
                            'type' => self::PARSEERROR,
2042
                            'data' => 'eof-in-doctype'
2043
                        ));
2044
                        $this->token['force-quirks'] = true;
2045
                        $this->emitToken($this->token);
2046
                        $this->stream->unget();
2047
                        $state = 'data';
2048
                    } else {
2049
                        /* Anything else
2050
                        Parse error. Switch to the bogus DOCTYPE state.
2051
                        (This does not set the DOCTYPE token's force-quirks
2052
                        flag to on.) */
2053
                        $this->emitToken(array(
2054
                            'type' => self::PARSEERROR,
2055
                            'data' => 'unexpected-char-in-doctype'
2056
                        ));
2057
                        $state = 'bogus DOCTYPE';
2058
                    }
2059
                break;
2060
2061
                case 'bogus DOCTYPE':
2062
                    /* Consume the next input character: */
2063
                    $char = $this->stream->char();
2064
2065
                    if ($char === '>') {
2066
                        /* U+003E GREATER-THAN SIGN (>)
2067
                        Emit the DOCTYPE token. Switch to the data state. */
2068
                        $this->emitToken($this->token);
2069
                        $state = 'data';
2070
2071
                    } elseif($char === false) {
2072
                        /* EOF
2073
                        Emit the DOCTYPE token. Reconsume the EOF character in
2074
                        the data state. */
2075
                        $this->emitToken($this->token);
2076
                        $this->stream->unget();
2077
                        $state = 'data';
2078
2079
                    } else {
2080
                        /* Anything else
2081
                        Stay in the bogus DOCTYPE state. */
2082
                    }
2083
                break;
2084
2085
                // case 'cdataSection':
2086
2087
            }
2088
        }
2089
    }
2090
2091
    /**
     * Asks the tree builder to save itself and hands back the result.
     *
     * This is a thin delegate: the tokenizer performs no work of its own
     * here, it simply forwards to save() on the tree builder it emits
     * tokens to (described by the original note as a serialized
     * representation of the tree).
     *
     * @return mixed Whatever the tree builder's save() returns.
     */
    public function save() {
        $serialized = $this->tree->save();
        return $serialized;
    }
2097
2098
    /**
     * Accessor for the input stream this tokenizer reads characters from.
     *
     * @return mixed The object held in $this->stream (declared on the class
     *               as pointing to an InputStream object).
     */
    public function stream() {
        $input = $this->stream;
        return $input;
    }
2104
2105
    private function consumeCharacterReference($allowed = false, $inattr = false) {
2106
        // This goes quite far against spec, and is far closer to the Python
2107
        // impl., mainly because we don't do the large unconsuming the spec
2108
        // requires.
2109
2110
        // All consumed characters.
2111
        $chars = $this->stream->char();
2112
2113
        /* This section defines how to consume a character
2114
        reference. This definition is used when parsing character
2115
        references in text and in attributes.
2116
2117
        The behavior depends on the identity of the next character
2118
        (the one immediately after the U+0026 AMPERSAND character): */
2119
2120
        if (
2121
            $chars[0] === "\x09" ||
2122
            $chars[0] === "\x0A" ||
2123
            $chars[0] === "\x0C" ||
2124
            $chars[0] === "\x20" ||
2125
            $chars[0] === '<' ||
2126
            $chars[0] === '&' ||
2127
            $chars === false ||
2128
            $chars[0] === $allowed
2129
        ) {
2130
            /* U+0009 CHARACTER TABULATION
2131
               U+000A LINE FEED (LF)
2132
               U+000C FORM FEED (FF)
2133
               U+0020 SPACE
2134
               U+003C LESS-THAN SIGN
2135
               U+0026 AMPERSAND
2136
               EOF
2137
               The additional allowed character, if there is one
2138
            Not a character reference. No characters are consumed,
2139
            and nothing is returned. (This is not an error, either.) */
2140
            // We already consumed, so unconsume.
2141
            $this->stream->unget();
2142
            return '&';
2143
        } elseif ($chars[0] === '#') {
2144
            /* Consume the U+0023 NUMBER SIGN. */
2145
            // Um, yeah, we already did that.
2146
            /* The behavior further depends on the character after
2147
            the U+0023 NUMBER SIGN: */
2148
            $chars .= $this->stream->char();
2149
            if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
2150
                /* U+0078 LATIN SMALL LETTER X
2151
                   U+0058 LATIN CAPITAL LETTER X */
2152
                /* Consume the X. */
2153
                // Um, yeah, we already did that.
2154
                /* Follow the steps below, but using the range of
2155
                characters U+0030 DIGIT ZERO through to U+0039 DIGIT
2156
                NINE, U+0061 LATIN SMALL LETTER A through to U+0066
2157
                LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
2158
                A, through to U+0046 LATIN CAPITAL LETTER F (in other
2159
                words, 0123456789, ABCDEF, abcdef). */
2160
                $char_class = self::HEX;
2161
                /* When it comes to interpreting the
2162
                number, interpret it as a hexadecimal number. */
2163
                $hex = true;
2164
            } else {
2165
                /* Anything else */
2166
                // Unconsume because we shouldn't have consumed this.
2167
                $chars = $chars[0];
2168
                $this->stream->unget();
2169
                /* Follow the steps below, but using the range of
2170
                characters U+0030 DIGIT ZERO through to U+0039 DIGIT
2171
                NINE (i.e. just 0123456789). */
2172
                $char_class = self::DIGIT;
2173
                /* When it comes to interpreting the number,
2174
                interpret it as a decimal number. */
2175
                $hex = false;
2176
            }
2177
2178
            /* Consume as many characters as match the range of characters given above. */
2179
            $consumed = $this->stream->charsWhile($char_class);
2180
            if ($consumed === '' || $consumed === false) {
2181
                /* If no characters match the range, then don't consume
2182
                any characters (and unconsume the U+0023 NUMBER SIGN
2183
                character and, if appropriate, the X character). This
2184
                is a parse error; nothing is returned. */
2185
                $this->emitToken(array(
2186
                    'type' => self::PARSEERROR,
2187
                    'data' => 'expected-numeric-entity'
2188
                ));
2189
                return '&' . $chars;
2190
            } else {
2191
                /* Otherwise, if the next character is a U+003B SEMICOLON,
2192
                consume that too. If it isn't, there is a parse error. */
2193
                if ($this->stream->char() !== ';') {
2194
                    $this->stream->unget();
2195
                    $this->emitToken(array(
2196
                        'type' => self::PARSEERROR,
2197
                        'data' => 'numeric-entity-without-semicolon'
2198
                    ));
2199
                }
2200
2201
                /* If one or more characters match the range, then take
2202
                them all and interpret the string of characters as a number
2203
                (either hexadecimal or decimal as appropriate). */
2204
                $codepoint = $hex ? hexdec($consumed) : (int) $consumed;
2205
2206
                /* If that number is one of the numbers in the first column
2207
                of the following table, then this is a parse error. Find the
2208
                row with that number in the first column, and return a
2209
                character token for the Unicode character given in the
2210
                second column of that row. */
2211
                $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
2212
                if ($new_codepoint) {
2213
                    $this->emitToken(array(
2214
                        'type' => self::PARSEERROR,
2215
                        'data' => 'illegal-windows-1252-entity'
2216
                    ));
2217
                    return HTML5_Data::utf8chr($new_codepoint);
2218
                } else {
2219
                    /* Otherwise, if the number is greater than 0x10FFFF, then 
2220
                     * this is a parse error. Return a U+FFFD REPLACEMENT 
2221
                     * CHARACTER. */
2222
                    if ($codepoint > 0x10FFFF) {
2223
                        $this->emitToken(array(
2224
                            'type' => self::PARSEERROR,
2225
                            'data' => 'overlong-character-entity' // XXX probably not correct
2226
                        ));
2227
                        return "\xEF\xBF\xBD";
2228
                    }
2229
                    /* Otherwise, return a character token for the Unicode 
2230
                     * character whose code point is that number.  If the 
2231
                     * number is in the range 0x0001 to 0x0008,    0x000E to 
2232
                     * 0x001F,  0x007F  to 0x009F, 0xD800 to 0xDFFF, 0xFDD0 to 
2233
                     * 0xFDEF, or is one of 0x000B, 0xFFFE, 0xFFFF, 0x1FFFE, 
2234
                     * 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 
2235
                     * 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 
2236
                     * 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 
2237
                     * 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 
2238
                     * 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 
2239
                     * or 0x10FFFF, then this is a parse error. */
2240
                    // && has higher precedence than ||
2241
                    if (
2242
                        $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
2243
                        $codepoint === 0x000B ||
2244
                        $codepoint >= 0x000E && $codepoint <= 0x001F ||
2245
                        $codepoint >= 0x007F && $codepoint <= 0x009F ||
2246
                        $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
2247
                        $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
2248
                        ($codepoint & 0xFFFE) === 0xFFFE ||
2249
                        $codepoint == 0x10FFFF || $codepoint == 0x10FFFE
2250
                    ) {
2251
                        $this->emitToken(array(
2252
                            'type' => self::PARSEERROR,
2253
                            'data' => 'illegal-codepoint-for-numeric-entity'
2254
                        ));
2255
                    }
2256
                    return HTML5_Data::utf8chr($codepoint);
2257
                }
2258
            }
2259
2260
        } else {
2261
            /* Anything else */
2262
2263
            /* Consume the maximum number of characters possible,
2264
            with the consumed characters matching one of the
2265
            identifiers in the first column of the named character
2266
            references table (in a case-sensitive manner). */
2267
            // What we actually do here is consume as much as we can while it
2268
            // matches the start of one of the identifiers in the first column.
2269
2270
            $refs = HTML5_Data::getNamedCharacterReferences();
2271
            
2272
            // Get the longest string which is the start of an identifier
2273
            // ($chars) as well as the longest identifier which matches ($id)
2274
            // and its codepoint ($codepoint).
2275
            $codepoint = false;
2276
            $char = $chars;
2277
            while ($char !== false && isset($refs[$char])) {
2278
                $refs = $refs[$char];
2279
                if (isset($refs['codepoint'])) {
2280
                    $id = $chars;
2281
                    $codepoint = $refs['codepoint'];
2282
                }
2283
                $chars .= $char = $this->stream->char();
2284
            }
2285
            
2286
            // Unconsume the one character we just took which caused the while
2287
            // statement to fail. This could be anything and could cause state
2288
            // changes (as if it matches the while loop it must be
2289
            // alphanumeric so we can just concat it to whatever we get later).
2290
            $this->stream->unget();
2291
            if ($char !== false) {
2292
                $chars = substr($chars, 0, -1);
2293
            }
2294
2295
            /* If no match can be made, then this is a parse error.
2296
            No characters are consumed, and nothing is returned. */
2297
            if (!$codepoint) {
2298
                $this->emitToken(array(
2299
                    'type' => self::PARSEERROR,
2300
                    'data' => 'expected-named-entity'
2301
                ));
2302
                return '&' . $chars;
2303
            }
2304
2305
            /* If the last character matched is not a U+003B SEMICOLON
2306
            (;), there is a parse error. */
2307
            $semicolon = true;
2308
            if (substr($id, -1) !== ';') {
2309
                $this->emitToken(array(
2310
                    'type' => self::PARSEERROR,
2311
                    'data' => 'named-entity-without-semicolon'
2312
                ));
2313
                $semicolon = false;
2314
            }
2315
2316
            /* If the character reference is being consumed as part of
2317
            an attribute, and the last character matched is not a
2318
            U+003B SEMICOLON (;), and the next character is in the
2319
            range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
2320
            LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
2321
            or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
2322
            then, for historical reasons, all the characters that were
2323
            matched after the U+0026 AMPERSAND (&) must be unconsumed,
2324
            and nothing is returned. */
2325
            if ($inattr && !$semicolon) {
2326
                // The next character is either the next character in $chars or in the stream.
2327
                if (strlen($chars) > strlen($id)) {
2328
                    $next = substr($chars, strlen($id), 1);
2329
                } else {
2330
                    $next = $this->stream->char();
2331
                    $this->stream->unget();
2332
                }
2333
                if (
2334
                    '0' <= $next && $next <= '9' ||
2335
                    'A' <= $next && $next <= 'Z' ||
2336
                    'a' <= $next && $next <= 'z'
2337
                ) {
2338
                    return '&' . $chars;
2339
                }
2340
            }
2341
2342
            /* Otherwise, return a character token for the character
2343
            corresponding to the character reference name (as given
2344
            by the second column of the named character references table). */
2345
            return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id));
2346
        }
2347
    }
2348
2349
    /**
     * Handles a character reference met while tokenizing an attribute value.
     *
     * Attempts to consume a character reference and appends the outcome to
     * the value of the last attribute of the current token.
     *
     * @param mixed $allowed Forwarded unchanged as the first argument of
     *                       consumeCharacterReference().
     */
    private function characterReferenceInAttributeValue($allowed = false) {
        /* Attempt to consume a character reference. */
        $entity = $this->consumeCharacterReference($allowed, true);

        /* If nothing is returned, append a U+0026 AMPERSAND
        character to the current attribute's value.

        Otherwise, append the returned character token to the
        current attribute's value. */
        // Only false, null and the empty string count as "nothing returned".
        // A plain truthiness test would also reject the string "0" -- which
        // is presumably what a numeric reference such as &#48; yields -- and
        // would wrongly append "&" in place of the digit.
        $nothingReturned = ($entity === false || $entity === null || $entity === '');
        $char = $nothingReturned ? '&' : $entity;

        // The attribute currently being built is the last one in the list.
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        /* Finally, switch back to the attribute value state that you
        were in when you were switched into this state. */
    }
2368
2369
    /**
     * Emits a token, passing it on to the tree builder.
     *
     * Before the token is forwarded, parse errors implied by the token
     * itself are emitted: attributes or a self-closing flag on an end tag,
     * and repeated attribute names on a start tag.
     *
     * @param array $token       Token to emit. Its 'type' entry is always
     *                           read; 'attr' and 'self-closing' are consulted
     *                           for tag tokens.
     * @param bool  $checkStream When true, errors queued on the input stream
     *                           are emitted first.
     * @param bool  $dry         When true, the token is checked but is not
     *                           handed to the tree builder.
     */
    protected function emitToken($token, $checkStream = true, $dry = false) {
        if ($checkStream) {
            // Emit errors from input stream. The nested call passes false
            // so that it does not walk this queue again.
            while ($this->stream->errors) {
                $pending = array_shift($this->stream->errors);
                $this->emitToken($pending, false);
            }
        }

        $type = $token['type'];

        if ($type === self::ENDTAG) {
            if (!empty($token['attr'])) {
                // One parse error per attribute found on the end tag.
                for ($left = count($token['attr']); $left > 0; $left--) {
                    $this->emitToken(array(
                        'type' => self::PARSEERROR,
                        'data' => 'attributes-in-end-tag'
                    ));
                }
            }
            if (!empty($token['self-closing'])) {
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'self-closing-flag-on-end-tag',
                ));
            }
        }

        if ($type === self::STARTTAG) {
            // This could be changed to actually pass the tree-builder a hash
            $seen = array();
            foreach ($token['attr'] as $attribute) {
                $name = $attribute['name'];
                if (!isset($seen[$name])) {
                    // First occurrence of this name: remember it.
                    $seen[$name] = $attribute['value'];
                    continue;
                }
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'duplicate-attribute',
                ));
            }
        }

        if (!$dry) {
            // the current structure of attributes is not a terribly good one
            $this->tree->emitToken($token);

            // An integer content model left on the tree builder is adopted
            // here and then cleared on the tree builder.
            if (is_int($this->tree->content_model)) {
                $this->content_model = $this->tree->content_model;
                $this->tree->content_model = null;
                return;
            }
        }

        // Otherwise an end tag puts the tokenizer back into PCDATA.
        if ($token['type'] === self::ENDTAG) {
            $this->content_model = self::PCDATA;
        }
    }
2421
}
2422
2423