Completed
Branch development (b1b115)
by Johannes
10:28
created

HTML5_Tokenizer   F

Complexity

Total Complexity 358

Size/Duplication

Total Lines 2431
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 358
c 0
b 0
f 0
dl 0
loc 2431
rs 0.8

How to fix   Complexity   

Complex Class

Complex classes like HTML5_Tokenizer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use HTML5_Tokenizer, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
/*
4
5
Copyright 2007 Jeroen van der Meer <http://jero.net/>
6
Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
7
Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
8
9
Permission is hereby granted, free of charge, to any person obtaining a
10
copy of this software and associated documentation files (the
11
"Software"), to deal in the Software without restriction, including
12
without limitation the rights to use, copy, modify, merge, publish,
13
distribute, sublicense, and/or sell copies of the Software, and to
14
permit persons to whom the Software is furnished to do so, subject to
15
the following conditions:
16
17
The above copyright notice and this permission notice shall be included
18
in all copies or substantial portions of the Software.
19
20
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
21
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
24
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
25
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
26
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
28
*/
29
30
// Some conventions:
31
// /* */ indicates verbatim text from the HTML 5 specification
32
// // indicates regular comments
33
34
// all flags are in hyphenated form
35
36
class HTML5_Tokenizer {
37
    /**
38
     * @var HTML5_InputStream
39
     *
40
     * Points to an InputStream object.
41
     */
42
    protected $stream;
43
44
    /**
45
     * @var HTML5_TreeBuilder
46
     *
47
     * Tree builder that the tokenizer emits tokens to.
48
     */
49
    private $tree;
50
51
    /**
52
     * @var int
53
     *
54
     * Current content model we are parsing as.
55
     */
56
    protected $content_model;
57
58
    /**
59
     * Current token that is being built, but not yet emitted. Also
60
     * is the last token emitted, if applicable.
61
     */
62
    protected $token;
63
64
    // These are constants describing the content model
65
    const PCDATA    = 0;
66
    const RCDATA    = 1;
67
    const CDATA     = 2;
68
    const PLAINTEXT = 3;
69
70
    // These are constants describing tokens
71
    // XXX should probably be moved somewhere else, probably the
72
    // HTML5 class.
73
    const DOCTYPE        = 0;
74
    const STARTTAG       = 1;
75
    const ENDTAG         = 2;
76
    const COMMENT        = 3;
77
    const CHARACTER      = 4;
78
    const SPACECHARACTER = 5;
79
    const EOF            = 6;
80
    const PARSEERROR     = 7;
81
82
    // These are constants representing bunches of characters.
83
    const ALPHA       = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
84
    const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
85
    const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
86
    const DIGIT       = '0123456789';
87
    const HEX         = '0123456789ABCDEFabcdef';
88
    const WHITESPACE  = "\t\n\x0c ";
89
90
    /**
     * Sets up a tokenizer over the given input.
     *
     * Wraps $data in an HTML5_InputStream, picks the tree builder that
     * tokens are emitted to, and starts in the PCDATA content model.
     *
     * @param mixed $data Data to parse; handed unchanged to HTML5_InputStream.
     * @param HTML5_TreeBuilder|null $builder Tree builder to use. Any falsy
     *        value (including the default null) causes a fresh
     *        HTML5_TreeBuilder to be created instead.
     */
    public function __construct($data, $builder = null) {
        $this->stream = new HTML5_InputStream($data);
        // Fall back to a default tree builder when none was supplied.
        $this->tree = $builder ? $builder : new HTML5_TreeBuilder;
        $this->content_model = self::PCDATA;
    }
103
104
    /**
     * Runs the tokenizer after letting the tree builder set up a context.
     *
     * The context is forwarded to the tree builder's setupContext(). If the
     * tree builder then exposes a truthy content_model, the tokenizer adopts
     * it and resets it to null on the tree builder. A content model of
     * PCDATA (0) is falsy, so it is never copied and the tokenizer's current
     * content model is left untouched. Finally parse() is invoked.
     *
     * @param mixed $context Passed through to the tree builder's
     *        setupContext(); defaults to null. NOTE(review): the expected
     *        type is not visible here - confirm against setupContext().
     */
    public function parseFragment($context = null) {
        $builder = $this->tree;
        $builder->setupContext($context);

        // Take over the content model requested by the tree builder, if any.
        $requested = $builder->content_model;
        if ($requested) {
            $this->content_model = $requested;
            $builder->content_model = null;
        }

        $this->parse();
    }
115
116
    // XXX maybe convert this into an iterator? regardless, this function
117
    // and the save function should go into a Parser facade of some sort
118
    /**
119
     * Performs the actual parsing of the document.
120
     */
121
    public function parse() {
122
        // Current state
123
        $state = 'data';
124
        // This is used to avoid having to have look-behind in the data state.
125
        $lastFourChars = '';
126
        /**
127
         * Escape flag as specified by the HTML5 specification: "used to
128
         * control the behavior of the tokeniser. It is either true or
129
         * false, and initially must be set to the false state."
130
         */
131
        $escape = false;
132
        //echo "\n\n";
133
        while($state !== null) {
134
135
            /*echo $state . ' ';
136
            switch ($this->content_model) {
137
                case self::PCDATA: echo 'PCDATA'; break;
138
                case self::RCDATA: echo 'RCDATA'; break;
139
                case self::CDATA: echo 'CDATA'; break;
140
                case self::PLAINTEXT: echo 'PLAINTEXT'; break;
141
            }
142
            if ($escape) echo " escape";
143
            echo "\n";*/
144
145
            switch($state) {
146
                case 'data':
147
148
                    /* Consume the next input character */
149
                    $char = $this->stream->char();
150
                    $lastFourChars .= $char;
151
                    if (strlen($lastFourChars) > 4) {
152
                        $lastFourChars = substr($lastFourChars, -4);
153
                    }
154
155
                    // see below for meaning
156
                    $hyp_cond =
157
                        !$escape &&
158
                        (
159
                            $this->content_model === self::RCDATA ||
160
                            $this->content_model === self::CDATA
161
                        );
162
                    $amp_cond =
163
                        !$escape &&
164
                        (
165
                            $this->content_model === self::PCDATA ||
166
                            $this->content_model === self::RCDATA
167
                        );
168
                    $lt_cond =
169
                        $this->content_model === self::PCDATA ||
170
                        (
171
                            (
172
                                $this->content_model === self::RCDATA ||
173
                                $this->content_model === self::CDATA
174
                             ) &&
175
                             !$escape
176
                        );
177
                    $gt_cond =
178
                        $escape &&
179
                        (
180
                            $this->content_model === self::RCDATA ||
181
                            $this->content_model === self::CDATA
182
                        );
183
184
                    if ($char === '&' && $amp_cond === true) {
185
                        /* U+0026 AMPERSAND (&)
186
                        When the content model flag is set to one of the PCDATA or RCDATA
187
                        states and the escape flag is false: switch to the
188
                        character reference data state. Otherwise: treat it as per
189
                        the "anything else" entry below. */
190
                        $state = 'character reference data';
191
192
                    } elseif (
193
                        $char === '-' &&
194
                        $hyp_cond === true &&
195
                        $lastFourChars === '<!--'
196
                    ) {
197
                        /*
198
                        U+002D HYPHEN-MINUS (-)
199
                        If the content model flag is set to either the RCDATA state or
200
                        the CDATA state, and the escape flag is false, and there are at
201
                        least three characters before this one in the input stream, and the
202
                        last four characters in the input stream, including this one, are
203
                        U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
204
                        and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
205
                        $escape = true;
206
207
                        /* In any case, emit the input character as a character token. Stay
208
                        in the data state. */
209
                        $this->emitToken(array(
210
                            'type' => self::CHARACTER,
211
                            'data' => '-'
212
                        ));
213
                        // We do the "any case" part as part of "anything else".
214
215
                    /* U+003C LESS-THAN SIGN (<) */
216
                    } elseif ($char === '<' && $lt_cond === true) {
217
                        /* When the content model flag is set to the PCDATA state: switch
218
                        to the tag open state.
219
220
                        When the content model flag is set to either the RCDATA state or
221
                        the CDATA state and the escape flag is false: switch to the tag
222
                        open state.
223
224
                        Otherwise: treat it as per the "anything else" entry below. */
225
                        $state = 'tag open';
226
227
                    /* U+003E GREATER-THAN SIGN (>) */
228
                    } elseif (
229
                        $char === '>' &&
230
                        $gt_cond === true &&
231
                        substr($lastFourChars, 1) === '-->'
232
                    ) {
233
                        /* If the content model flag is set to either the RCDATA state or
234
                        the CDATA state, and the escape flag is true, and the last three
235
                        characters in the input stream including this one are U+002D
236
                        HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
237
                        set the escape flag to false. */
238
                        $escape = false;
239
240
                        /* In any case, emit the input character as a character token.
241
                        Stay in the data state. */
242
                        $this->emitToken(array(
243
                            'type' => self::CHARACTER,
244
                            'data' => '>'
245
                        ));
246
                        // We do the "any case" part as part of "anything else".
247
248
                    } elseif ($char === false) {
249
                        /* EOF
250
                        Emit an end-of-file token. */
251
                        $state = null;
252
                        $this->tree->emitToken(array(
253
                            'type' => self::EOF
254
                        ));
255
256
                    } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
257
                        // Directly after emitting a token you switch back to the "data
258
                        // state". At that point spaceCharacters are important so they are
259
                        // emitted separately.
260
                        $chars = $this->stream->charsWhile(self::WHITESPACE);
261
                        $this->emitToken(array(
262
                            'type' => self::SPACECHARACTER,
263
                            'data' => $char . $chars
264
                        ));
265
                        $lastFourChars .= $chars;
266
                        if (strlen($lastFourChars) > 4) {
267
                            $lastFourChars = substr($lastFourChars, -4);
268
                        }
269
                    } else {
270
                        /* Anything else
271
                        THIS IS AN OPTIMIZATION: Get as many characters that
272
                        otherwise would also be treated as a character token and emit it
273
                        as a single character token. Stay in the data state. */
274
275
                        $mask = '';
276
                        if ($hyp_cond === true) {
277
                            $mask .= '-';
278
                        }
279
                        if ($amp_cond === true) {
280
                            $mask .= '&';
281
                        }
282
                        if ($lt_cond === true) {
283
                            $mask .= '<';
284
                        }
285
                        if ($gt_cond === true) {
286
                            $mask .= '>';
287
                        }
288
289
                        if ($mask === '') {
290
                            $chars = $this->stream->remainingChars();
291
                        } else {
292
                            $chars = $this->stream->charsUntil($mask);
293
                        }
294
295
                        $this->emitToken(array(
296
                            'type' => self::CHARACTER,
297
                            'data' => $char . $chars
298
                        ));
299
300
                        $lastFourChars .= $chars;
301
                        if (strlen($lastFourChars) > 4) {
302
                            $lastFourChars = substr($lastFourChars, -4);
303
                        }
304
305
                        $state = 'data';
306
                    }
307
                break;
308
309
                case 'character reference data':
310
                    /* (This cannot happen if the content model flag
311
                    is set to the CDATA state.) */
312
313
                    /* Attempt to consume a character reference, with no
314
                    additional allowed character. */
315
                    $entity = $this->consumeCharacterReference();
316
317
                    /* If nothing is returned, emit a U+0026 AMPERSAND
318
                    character token. Otherwise, emit the character token that
319
                    was returned. */
320
                    // This is all done when consuming the character reference.
321
                    $this->emitToken(array(
322
                        'type' => self::CHARACTER,
323
                        'data' => $entity
324
                    ));
325
326
                    /* Finally, switch to the data state. */
327
                    $state = 'data';
328
                break;
329
330
                case 'tag open':
331
                    $char = $this->stream->char();
332
333
                    switch ($this->content_model) {
334
                        case self::RCDATA:
335
                        case self::CDATA:
336
                            /* Consume the next input character. If it is a
337
                            U+002F SOLIDUS (/) character, switch to the close
338
                            tag open state. Otherwise, emit a U+003C LESS-THAN
339
                            SIGN character token and reconsume the current input
340
                            character in the data state. */
341
                            // We consumed above.
342
343
                            if ($char === '/') {
344
                                $state = 'close tag open';
345
                            } else {
346
                                $this->emitToken(array(
347
                                    'type' => self::CHARACTER,
348
                                    'data' => '<'
349
                                ));
350
351
                                $this->stream->unget();
352
353
                                $state = 'data';
354
                            }
355
                        break;
356
357
                        case self::PCDATA:
358
                            /* If the content model flag is set to the PCDATA state
359
                            Consume the next input character: */
360
                            // We consumed above.
361
362
                            if ($char === '!') {
363
                                /* U+0021 EXCLAMATION MARK (!)
364
                                Switch to the markup declaration open state. */
365
                                $state = 'markup declaration open';
366
367
                            } elseif ($char === '/') {
368
                                /* U+002F SOLIDUS (/)
369
                                Switch to the close tag open state. */
370
                                $state = 'close tag open';
371
372
                            } elseif ('A' <= $char && $char <= 'Z') {
373
                                /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
374
                                Create a new start tag token, set its tag name to the lowercase
375
                                version of the input character (add 0x0020 to the character's code
376
                                point), then switch to the tag name state. (Don't emit the token
377
                                yet; further details will be filled in before it is emitted.) */
378
                                $this->token = array(
379
                                    'name'  => strtolower($char),
380
                                    'type'  => self::STARTTAG,
381
                                    'attr'  => array()
382
                                );
383
384
                                $state = 'tag name';
385
386
                            } elseif ('a' <= $char && $char <= 'z') {
387
                                /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
388
                                Create a new start tag token, set its tag name to the input
389
                                character, then switch to the tag name state. (Don't emit
390
                                the token yet; further details will be filled in before it
391
                                is emitted.) */
392
                                $this->token = array(
393
                                    'name'  => $char,
394
                                    'type'  => self::STARTTAG,
395
                                    'attr'  => array()
396
                                );
397
398
                                $state = 'tag name';
399
400
                            } elseif ($char === '>') {
401
                                /* U+003E GREATER-THAN SIGN (>)
402
                                Parse error. Emit a U+003C LESS-THAN SIGN character token and a
403
                                U+003E GREATER-THAN SIGN character token. Switch to the data state. */
404
                                $this->emitToken(array(
405
                                    'type' => self::PARSEERROR,
406
                                    'data' => 'expected-tag-name-but-got-right-bracket'
407
                                ));
408
                                $this->emitToken(array(
409
                                    'type' => self::CHARACTER,
410
                                    'data' => '<>'
411
                                ));
412
413
                                $state = 'data';
414
415
                            } elseif ($char === '?') {
416
                                /* U+003F QUESTION MARK (?)
417
                                Parse error. Switch to the bogus comment state. */
418
                                $this->emitToken(array(
419
                                    'type' => self::PARSEERROR,
420
                                    'data' => 'expected-tag-name-but-got-question-mark'
421
                                ));
422
                                $this->token = array(
423
                                    'data' => '?',
424
                                    'type' => self::COMMENT
425
                                );
426
                                $state = 'bogus comment';
427
428
                            } else {
429
                                /* Anything else
430
                                Parse error. Emit a U+003C LESS-THAN SIGN character token and
431
                                reconsume the current input character in the data state. */
432
                                $this->emitToken(array(
433
                                    'type' => self::PARSEERROR,
434
                                    'data' => 'expected-tag-name'
435
                                ));
436
                                $this->emitToken(array(
437
                                    'type' => self::CHARACTER,
438
                                    'data' => '<'
439
                                ));
440
441
                                $state = 'data';
442
                                $this->stream->unget();
443
                            }
444
                        break;
445
                    }
446
                break;
447
448
                case 'close tag open':
449
                    if (
450
                        $this->content_model === self::RCDATA ||
451
                        $this->content_model === self::CDATA
452
                    ) {
453
                        /* If the content model flag is set to the RCDATA or CDATA
454
                        states... */
455
                        $name = strtolower($this->stream->charsWhile(self::ALPHA));
456
                        $following = $this->stream->char();
457
                        $this->stream->unget();
458
                        if (
459
                            !$this->token ||
460
                            $this->token['name'] !== $name ||
461
                            $this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false))
462
                        ) {
463
                            /* if no start tag token has ever been emitted by this instance
464
                            of the tokenizer (fragment case), or, if the next few
465
                            characters do not match the tag name of the last start tag
466
                            token emitted (compared in an ASCII case-insensitive manner),
467
                            or if they do but they are not immediately followed by one of
468
                            the following characters:
469
470
                                * U+0009 CHARACTER TABULATION
471
                                * U+000A LINE FEED (LF)
472
                                * U+000C FORM FEED (FF)
473
                                * U+0020 SPACE
474
                                * U+003E GREATER-THAN SIGN (>)
475
                                * U+002F SOLIDUS (/)
476
                                * EOF
477
478
                            ...then emit a U+003C LESS-THAN SIGN character token, a
479
                            U+002F SOLIDUS character token, and switch to the data
480
                            state to process the next input character. */
481
                            // XXX: Probably ought to replace in_array with $following === x ||...
482
483
                            // We also need to emit $name now we've consumed that, as we
484
                            // know it'll just be emitted as a character token.
485
                            $this->emitToken(array(
486
                                'type' => self::CHARACTER,
487
                                'data' => '</' . $name
488
                            ));
489
490
                            $state = 'data';
491
                        } else {
492
                            // This matches what would happen if we actually did the
493
                            // otherwise below (but we can't because we've consumed too
494
                            // much).
495
496
                            // Start the end tag token with the name we already have.
497
                            $this->token = array(
498
                                'name'  => $name,
499
                                'type'  => self::ENDTAG
500
                            );
501
502
                            // Change to tag name state.
503
                            $state = 'tag name';
504
                        }
505
                    } elseif ($this->content_model === self::PCDATA) {
506
                        /* Otherwise, if the content model flag is set to the PCDATA
507
                        state [...]: */
508
                        $char = $this->stream->char();
509
510
                        if ('A' <= $char && $char <= 'Z') {
511
                            /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
512
                            Create a new end tag token, set its tag name to the lowercase version
513
                            of the input character (add 0x0020 to the character's code point), then
514
                            switch to the tag name state. (Don't emit the token yet; further details
515
                            will be filled in before it is emitted.) */
516
                            $this->token = array(
517
                                'name'  => strtolower($char),
518
                                'type'  => self::ENDTAG
519
                            );
520
521
                            $state = 'tag name';
522
523
                        } elseif ('a' <= $char && $char <= 'z') {
524
                            /* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
525
                            Create a new end tag token, set its tag name to the
526
                            input character, then switch to the tag name state.
527
                            (Don't emit the token yet; further details will be
528
                            filled in before it is emitted.) */
529
                            $this->token = array(
530
                                'name'  => $char,
531
                                'type'  => self::ENDTAG
532
                            );
533
534
                            $state = 'tag name';
535
536
                        } elseif ($char === '>') {
537
                            /* U+003E GREATER-THAN SIGN (>)
538
                            Parse error. Switch to the data state. */
539
                            $this->emitToken(array(
540
                                'type' => self::PARSEERROR,
541
                                'data' => 'expected-closing-tag-but-got-right-bracket'
542
                            ));
543
                            $state = 'data';
544
545
                        } elseif ($char === false) {
546
                            /* EOF
547
                            Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
548
                            SOLIDUS character token. Reconsume the EOF character in the data state. */
549
                            $this->emitToken(array(
550
                                'type' => self::PARSEERROR,
551
                                'data' => 'expected-closing-tag-but-got-eof'
552
                            ));
553
                            $this->emitToken(array(
554
                                'type' => self::CHARACTER,
555
                                'data' => '</'
556
                            ));
557
558
                            $this->stream->unget();
559
                            $state = 'data';
560
561
                        } else {
562
                            /* Parse error. Switch to the bogus comment state. */
563
                            $this->emitToken(array(
564
                                'type' => self::PARSEERROR,
565
                                'data' => 'expected-closing-tag-but-got-char'
566
                            ));
567
                            $this->token = array(
568
                                'data' => $char,
569
                                'type' => self::COMMENT
570
                            );
571
                            $state = 'bogus comment';
572
                        }
573
                    }
574
                break;
575
576
                case 'tag name':
577
                    /* Consume the next input character: */
578
                    $char = $this->stream->char();
579
580
                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
581
                        /* U+0009 CHARACTER TABULATION
582
                        U+000A LINE FEED (LF)
583
                        U+000C FORM FEED (FF)
584
                        U+0020 SPACE
585
                        Switch to the before attribute name state. */
586
                        $state = 'before attribute name';
587
588
                    } elseif ($char === '/') {
589
                        /* U+002F SOLIDUS (/)
590
                        Switch to the self-closing start tag state. */
591
                        $state = 'self-closing start tag';
592
593
                    } elseif ($char === '>') {
594
                        /* U+003E GREATER-THAN SIGN (>)
595
                        Emit the current tag token. Switch to the data state. */
596
                        $this->emitToken($this->token);
597
                        $state = 'data';
598
599
                    } elseif ('A' <= $char && $char <= 'Z') {
600
                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
601
                        Append the lowercase version of the current input
602
                        character (add 0x0020 to the character's code point) to
603
                        the current tag token's tag name. Stay in the tag name state. */
604
                        $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
605
606
                        $this->token['name'] .= strtolower($char . $chars);
607
                        $state = 'tag name';
608
609
                    } elseif ($char === false) {
610
                        /* EOF
611
                        Parse error. Reconsume the EOF character in the data state. */
612
                        $this->emitToken(array(
613
                            'type' => self::PARSEERROR,
614
                            'data' => 'eof-in-tag-name'
615
                        ));
616
617
                        $this->stream->unget();
618
                        $state = 'data';
619
620
                    } else {
621
                        /* Anything else
622
                        Append the current input character to the current tag token's tag name.
623
                        Stay in the tag name state. */
624
                        $chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA);
625
626
                        $this->token['name'] .= $char . $chars;
627
                        $state = 'tag name';
628
                    }
629
                break;
630
631
                case 'before attribute name':
632
                    /* Consume the next input character: */
633
                    $char = $this->stream->char();
634
635
                    // this conditional is optimized, check bottom
636
                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
637
                        /* U+0009 CHARACTER TABULATION
638
                        U+000A LINE FEED (LF)
639
                        U+000C FORM FEED (FF)
640
                        U+0020 SPACE
641
                        Stay in the before attribute name state. */
642
                        $state = 'before attribute name';
643
644
                    } elseif ($char === '/') {
645
                        /* U+002F SOLIDUS (/)
646
                        Switch to the self-closing start tag state. */
647
                        $state = 'self-closing start tag';
648
649
                    } elseif ($char === '>') {
650
                        /* U+003E GREATER-THAN SIGN (>)
651
                        Emit the current tag token. Switch to the data state. */
652
                        $this->emitToken($this->token);
653
                        $state = 'data';
654
655
                    } elseif ('A' <= $char && $char <= 'Z') {
656
                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
657
                        Start a new attribute in the current tag token. Set that
658
                        attribute's name to the lowercase version of the current
659
                        input character (add 0x0020 to the character's code
660
                        point), and its value to the empty string. Switch to the
661
                        attribute name state.*/
662
                        $this->token['attr'][] = array(
663
                            'name'  => strtolower($char),
664
                            'value' => ''
665
                        );
666
667
                        $state = 'attribute name';
668
669
                    } elseif ($char === false) {
670
                        /* EOF
671
                        Parse error. Reconsume the EOF character in the data state. */
672
                        $this->emitToken(array(
673
                            'type' => self::PARSEERROR,
674
                            'data' => 'expected-attribute-name-but-got-eof'
675
                        ));
676
677
                        $this->stream->unget();
678
                        $state = 'data';
679
680
                    } else {
681
                        /* U+0022 QUOTATION MARK (")
682
                           U+0027 APOSTROPHE (')
683
                           U+003C LESS-THAN SIGN (<)
684
                           U+003D EQUALS SIGN (=)
685
                        Parse error. Treat it as per the "anything else" entry
686
                        below. */
687
                        if ($char === '"' || $char === "'" || $char === '<' || $char === '=') {
688
                            $this->emitToken(array(
689
                                'type' => self::PARSEERROR,
690
                                'data' => 'invalid-character-in-attribute-name'
691
                            ));
692
                        }
693
694
                        /* Anything else
695
                        Start a new attribute in the current tag token. Set that attribute's
696
                        name to the current input character, and its value to the empty string.
697
                        Switch to the attribute name state. */
698
                        $this->token['attr'][] = array(
699
                            'name'  => $char,
700
                            'value' => ''
701
                        );
702
703
                        $state = 'attribute name';
704
                    }
705
                break;
706
707
                case 'attribute name':
708
                    // Consume the next input character:
709
                    $char = $this->stream->char();
710
711
                    // this conditional is optimized, check bottom
712
                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
713
                        /* U+0009 CHARACTER TABULATION
714
                        U+000A LINE FEED (LF)
715
                        U+000C FORM FEED (FF)
716
                        U+0020 SPACE
717
                        Switch to the after attribute name state. */
718
                        $state = 'after attribute name';
719
720
                    } elseif ($char === '/') {
721
                        /* U+002F SOLIDUS (/)
722
                        Switch to the self-closing start tag state. */
723
                        $state = 'self-closing start tag';
724
725
                    } elseif ($char === '=') {
726
                        /* U+003D EQUALS SIGN (=)
727
                        Switch to the before attribute value state. */
728
                        $state = 'before attribute value';
729
730
                    } elseif ($char === '>') {
731
                        /* U+003E GREATER-THAN SIGN (>)
732
                        Emit the current tag token. Switch to the data state. */
733
                        $this->emitToken($this->token);
734
                        $state = 'data';
735
736
                    } elseif ('A' <= $char && $char <= 'Z') {
737
                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
738
                        Append the lowercase version of the current input
739
                        character (add 0x0020 to the character's code point) to
740
                        the current attribute's name. Stay in the attribute name
741
                        state. */
742
                        $chars = $this->stream->charsWhile(self::UPPER_ALPHA);
743
744
                        $last = count($this->token['attr']) - 1;
745
                        $this->token['attr'][$last]['name'] .= strtolower($char . $chars);
746
747
                        $state = 'attribute name';
748
749
                    } elseif ($char === false) {
750
                        /* EOF
751
                        Parse error. Reconsume the EOF character in the data state. */
752
                        $this->emitToken(array(
753
                            'type' => self::PARSEERROR,
754
                            'data' => 'eof-in-attribute-name'
755
                        ));
756
757
                        $this->stream->unget();
758
                        $state = 'data';
759
760
                    } else {
761
                        /* U+0022 QUOTATION MARK (")
762
                           U+0027 APOSTROPHE (')
763
                           U+003C LESS-THAN SIGN (<)
764
                        Parse error. Treat it as per the "anything else"
765
                        entry below. */
766
                        if ($char === '"' || $char === "'" || $char === '<') {
767
                            $this->emitToken(array(
768
                                'type' => self::PARSEERROR,
769
                                'data' => 'invalid-character-in-attribute-name'
770
                            ));
771
                        }
772
773
                        /* Anything else
774
                        Append the current input character to the current attribute's name.
775
                        Stay in the attribute name state. */
776
                        $chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA);
777
778
                        $last = count($this->token['attr']) - 1;
779
                        $this->token['attr'][$last]['name'] .= $char . $chars;
780
781
                        $state = 'attribute name';
782
                    }
783
784
                    /* When the user agent leaves the attribute name state
785
                    (and before emitting the tag token, if appropriate), the
786
                    complete attribute's name must be compared to the other
787
                    attributes on the same token; if there is already an
788
                    attribute on the token with the exact same name, then this
789
                    is a parse error and the new attribute must be dropped, along
790
                    with the value that gets associated with it (if any). */
791
                    // this might be implemented in the emitToken method
792
                break;
793
794
                case 'after attribute name':
795
                    // Consume the next input character:
796
                    $char = $this->stream->char();
797
798
                    // this is an optimized conditional, check the bottom
799
                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
800
                        /* U+0009 CHARACTER TABULATION
801
                        U+000A LINE FEED (LF)
802
                        U+000C FORM FEED (FF)
803
                        U+0020 SPACE
804
                        Stay in the after attribute name state. */
805
                        $state = 'after attribute name';
806
807
                    } elseif ($char === '/') {
808
                        /* U+002F SOLIDUS (/)
809
                        Switch to the self-closing start tag state. */
810
                        $state = 'self-closing start tag';
811
812
                    } elseif ($char === '=') {
813
                        /* U+003D EQUALS SIGN (=)
814
                        Switch to the before attribute value state. */
815
                        $state = 'before attribute value';
816
817
                    } elseif ($char === '>') {
818
                        /* U+003E GREATER-THAN SIGN (>)
819
                        Emit the current tag token. Switch to the data state. */
820
                        $this->emitToken($this->token);
821
                        $state = 'data';
822
823
                    } elseif ('A' <= $char && $char <= 'Z') {
824
                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
825
                        Start a new attribute in the current tag token. Set that
826
                        attribute's name to the lowercase version of the current
827
                        input character (add 0x0020 to the character's code
828
                        point), and its value to the empty string. Switch to the
829
                        attribute name state. */
830
                        $this->token['attr'][] = array(
831
                            'name'  => strtolower($char),
832
                            'value' => ''
833
                        );
834
835
                        $state = 'attribute name';
836
837
                    } elseif ($char === false) {
838
                        /* EOF
839
                        Parse error. Reconsume the EOF character in the data state. */
840
                        $this->emitToken(array(
841
                            'type' => self::PARSEERROR,
842
                            'data' => 'expected-end-of-tag-but-got-eof'
843
                        ));
844
845
                        $this->stream->unget();
846
                        $state = 'data';
847
848
                    } else {
849
                        /* U+0022 QUOTATION MARK (")
850
                           U+0027 APOSTROPHE (')
851
                           U+003C LESS-THAN SIGN (<)
852
                        Parse error. Treat it as per the "anything else"
853
                        entry below. */
854
                        if ($char === '"' || $char === "'" || $char === "<") {
855
                            $this->emitToken(array(
856
                                'type' => self::PARSEERROR,
857
                                'data' => 'invalid-character-after-attribute-name'
858
                            ));
859
                        }
860
861
                        /* Anything else
862
                        Start a new attribute in the current tag token. Set that attribute's
863
                        name to the current input character, and its value to the empty string.
864
                        Switch to the attribute name state. */
865
                        $this->token['attr'][] = array(
866
                            'name'  => $char,
867
                            'value' => ''
868
                        );
869
870
                        $state = 'attribute name';
871
                    }
872
                break;
873
874
                case 'before attribute value':
875
                    // Consume the next input character:
876
                    $char = $this->stream->char();
877
878
                    // this is an optimized conditional
879
                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
880
                        /* U+0009 CHARACTER TABULATION
881
                        U+000A LINE FEED (LF)
882
                        U+000C FORM FEED (FF)
883
                        U+0020 SPACE
884
                        Stay in the before attribute value state. */
885
                        $state = 'before attribute value';
886
887
                    } elseif ($char === '"') {
888
                        /* U+0022 QUOTATION MARK (")
889
                        Switch to the attribute value (double-quoted) state. */
890
                        $state = 'attribute value (double-quoted)';
891
892
                    } elseif ($char === '&') {
893
                        /* U+0026 AMPERSAND (&)
894
                        Switch to the attribute value (unquoted) state and reconsume
895
                        this input character. */
896
                        $this->stream->unget();
897
                        $state = 'attribute value (unquoted)';
898
899
                    } elseif ($char === '\'') {
900
                        /* U+0027 APOSTROPHE (')
901
                        Switch to the attribute value (single-quoted) state. */
902
                        $state = 'attribute value (single-quoted)';
903
904
                    } elseif ($char === '>') {
905
                        /* U+003E GREATER-THAN SIGN (>)
906
                        Parse error. Emit the current tag token. Switch to the data state. */
907
                        $this->emitToken(array(
908
                            'type' => self::PARSEERROR,
909
                            'data' => 'expected-attribute-value-but-got-right-bracket'
910
                        ));
911
                        $this->emitToken($this->token);
912
                        $state = 'data';
913
914
                    } elseif ($char === false) {
915
                        /* EOF
916
                        Parse error. Reconsume the EOF character in the data state. */
917
                        $this->emitToken(array(
918
                            'type' => self::PARSEERROR,
919
                            'data' => 'expected-attribute-value-but-got-eof'
920
                        ));
921
                        $this->stream->unget();
922
                        $state = 'data';
923
924
                    } else {
925
                        /* U+003D EQUALS SIGN (=)
926
                         * U+003C LESS-THAN SIGN (<)
927
                        Parse error. Treat it as per the "anything else" entry below. */
928
                        if ($char === '=' || $char === '<') {
929
                            $this->emitToken(array(
930
                                'type' => self::PARSEERROR,
931
                                'data' => 'equals-in-unquoted-attribute-value'
932
                            ));
933
                        }
934
935
                        /* Anything else
936
                        Append the current input character to the current attribute's value.
937
                        Switch to the attribute value (unquoted) state. */
938
                        $last = count($this->token['attr']) - 1;
939
                        $this->token['attr'][$last]['value'] .= $char;
940
941
                        $state = 'attribute value (unquoted)';
942
                    }
943
                break;
944
945
                case 'attribute value (double-quoted)':
946
                    // Consume the next input character:
947
                    $char = $this->stream->char();
948
949
                    if ($char === '"') {
950
                        /* U+0022 QUOTATION MARK (")
951
                        Switch to the after attribute value (quoted) state. */
952
                        $state = 'after attribute value (quoted)';
953
954
                    } elseif ($char === '&') {
955
                        /* U+0026 AMPERSAND (&)
956
                        Switch to the character reference in attribute value
957
                        state, with the additional allowed character
958
                        being U+0022 QUOTATION MARK ("). */
959
                        $this->characterReferenceInAttributeValue('"');
960
961
                    } elseif ($char === false) {
962
                        /* EOF
963
                        Parse error. Reconsume the EOF character in the data state. */
964
                        $this->emitToken(array(
965
                            'type' => self::PARSEERROR,
966
                            'data' => 'eof-in-attribute-value-double-quote'
967
                        ));
968
969
                        $this->stream->unget();
970
                        $state = 'data';
971
972
                    } else {
973
                        /* Anything else
974
                        Append the current input character to the current attribute's value.
975
                        Stay in the attribute value (double-quoted) state. */
976
                        $chars = $this->stream->charsUntil('"&');
977
978
                        $last = count($this->token['attr']) - 1;
979
                        $this->token['attr'][$last]['value'] .= $char . $chars;
980
981
                        $state = 'attribute value (double-quoted)';
982
                    }
983
                break;
984
985
                case 'attribute value (single-quoted)':
986
                    // Consume the next input character:
987
                    $char = $this->stream->char();
988
989
                    if ($char === "'") {
990
                        /* U+0027 APOSTROPHE (')
991
                        Switch to the after attribute value state. */
992
                        $state = 'after attribute value (quoted)';
993
994
                    } elseif ($char === '&') {
995
                        /* U+0026 AMPERSAND (&)
996
                        Switch to the character reference in attribute value state, with the additional allowed character being U+0027 APOSTROPHE ('). */
997
                        $this->characterReferenceInAttributeValue("'");
998
999
                    } elseif ($char === false) {
1000
                        /* EOF
1001
                        Parse error. Reconsume the EOF character in the data state. */
1002
                        $this->emitToken(array(
1003
                            'type' => self::PARSEERROR,
1004
                            'data' => 'eof-in-attribute-value-single-quote'
1005
                        ));
1006
1007
                        $this->stream->unget();
1008
                        $state = 'data';
1009
1010
                    } else {
1011
                        /* Anything else
1012
                        Append the current input character to the current attribute's value.
1013
                        Stay in the attribute value (single-quoted) state. */
1014
                        $chars = $this->stream->charsUntil("'&");
1015
1016
                        $last = count($this->token['attr']) - 1;
1017
                        $this->token['attr'][$last]['value'] .= $char . $chars;
1018
1019
                        $state = 'attribute value (single-quoted)';
1020
                    }
1021
                break;
1022
1023
                case 'attribute value (unquoted)':
1024
                    // Consume the next input character:
1025
                    $char = $this->stream->char();
1026
1027
                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1028
                        /* U+0009 CHARACTER TABULATION
1029
                        U+000A LINE FEED (LF)
1030
                        U+000C FORM FEED (FF)
1031
                        U+0020 SPACE
1032
                        Switch to the before attribute name state. */
1033
                        $state = 'before attribute name';
1034
1035
                    } elseif ($char === '&') {
1036
                        /* U+0026 AMPERSAND (&)
1037
                        Switch to the entity in attribute value state, with the
1038
                        additional allowed character  being U+003E
1039
                        GREATER-THAN SIGN (>). */
1040
                        $this->characterReferenceInAttributeValue('>');
1041
1042
                    } elseif ($char === '>') {
1043
                        /* U+003E GREATER-THAN SIGN (>)
1044
                        Emit the current tag token. Switch to the data state. */
1045
                        $this->emitToken($this->token);
1046
                        $state = 'data';
1047
1048
                    } elseif ($char === false) {
1049
                        /* EOF
1050
                        Parse error. Reconsume the EOF character in the data state. */
1051
                        $this->emitToken(array(
1052
                            'type' => self::PARSEERROR,
1053
                            'data' => 'eof-in-attribute-value-no-quotes'
1054
                        ));
1055
                        $this->stream->unget();
1056
                        $state = 'data';
1057
1058
                    } else {
1059
                        /* U+0022 QUOTATION MARK (")
1060
                           U+0027 APOSTROPHE (')
1061
                           U+003C LESS-THAN SIGN (<)
1062
                           U+003D EQUALS SIGN (=)
1063
                        Parse error. Treat it as per the "anything else"
1064
                        entry below. */
1065
                        if ($char === '"' || $char === "'" || $char === '=' || $char == '<') {
1066
                            $this->emitToken(array(
1067
                                'type' => self::PARSEERROR,
1068
                                'data' => 'unexpected-character-in-unquoted-attribute-value'
1069
                            ));
1070
                        }
1071
1072
                        /* Anything else
1073
                        Append the current input character to the current attribute's value.
1074
                        Stay in the attribute value (unquoted) state. */
1075
                        $chars = $this->stream->charsUntil("\t\n\x0c &>\"'=");
1076
1077
                        $last = count($this->token['attr']) - 1;
1078
                        $this->token['attr'][$last]['value'] .= $char . $chars;
1079
1080
                        $state = 'attribute value (unquoted)';
1081
                    }
1082
                break;
1083
1084
                case 'after attribute value (quoted)':
1085
                    /* Consume the next input character: */
1086
                    $char = $this->stream->char();
1087
1088
                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1089
                        /* U+0009 CHARACTER TABULATION
1090
                           U+000A LINE FEED (LF)
1091
                           U+000C FORM FEED (FF)
1092
                           U+0020 SPACE
1093
                        Switch to the before attribute name state. */
1094
                        $state = 'before attribute name';
1095
1096
                    } elseif ($char === '/') {
1097
                        /* U+002F SOLIDUS (/)
1098
                        Switch to the self-closing start tag state. */
1099
                        $state = 'self-closing start tag';
1100
1101
                    } elseif ($char === '>') {
1102
                        /* U+003E GREATER-THAN SIGN (>)
1103
                        Emit the current tag token. Switch to the data state. */
1104
                        $this->emitToken($this->token);
1105
                        $state = 'data';
1106
1107
                    } elseif ($char === false) {
1108
                        /* EOF
1109
                        Parse error. Reconsume the EOF character in the data state. */
1110
                        $this->emitToken(array(
1111
                            'type' => self::PARSEERROR,
1112
                            'data' => 'unexpected-EOF-after-attribute-value'
1113
                        ));
1114
                        $this->stream->unget();
1115
                        $state = 'data';
1116
1117
                    } else {
1118
                        /* Anything else
1119
                        Parse error. Reconsume the character in the before attribute
1120
                        name state. */
1121
                        $this->emitToken(array(
1122
                            'type' => self::PARSEERROR,
1123
                            'data' => 'unexpected-character-after-attribute-value'
1124
                        ));
1125
                        $this->stream->unget();
1126
                        $state = 'before attribute name';
1127
                    }
1128
                break;
1129
1130
                case 'self-closing start tag':
1131
                    /* Consume the next input character: */
1132
                    $char = $this->stream->char();
1133
1134
                    if ($char === '>') {
1135
                        /* U+003E GREATER-THAN SIGN (>)
1136
                        Set the self-closing flag of the current tag token.
1137
                        Emit the current tag token. Switch to the data state. */
1138
                        // not sure if this is the name we want
1139
                        $this->token['self-closing'] = true;
1140
                        $this->emitToken($this->token);
1141
                        $state = 'data';
1142
1143
                    } elseif ($char === false) {
1144
                        /* EOF
1145
                        Parse error. Reconsume the EOF character in the data state. */
1146
                        $this->emitToken(array(
1147
                            'type' => self::PARSEERROR,
1148
                            'data' => 'unexpected-eof-after-self-closing'
1149
                        ));
1150
                        $this->stream->unget();
1151
                        $state = 'data';
1152
1153
                    } else {
1154
                        /* Anything else
1155
                        Parse error. Reconsume the character in the before attribute name state. */
1156
                        $this->emitToken(array(
1157
                            'type' => self::PARSEERROR,
1158
                            'data' => 'unexpected-character-after-self-closing'
1159
                        ));
1160
                        $this->stream->unget();
1161
                        $state = 'before attribute name';
1162
                    }
1163
                break;
1164
1165
                case 'bogus comment':
1166
                    /* (This can only happen if the content model flag is set to the PCDATA state.) */
1167
                    /* Consume every character up to the first U+003E GREATER-THAN SIGN
1168
                    character (>) or the end of the file (EOF), whichever comes first. Emit
1169
                    a comment token whose data is the concatenation of all the characters
1170
                    starting from and including the character that caused the state machine
1171
                    to switch into the bogus comment state, up to and including the last
1172
                    consumed character before the U+003E character, if any, or up to the
1173
                    end of the file otherwise. (If the comment was started by the end of
1174
                    the file (EOF), the token is empty.) */
1175
                    $this->token['data'] .= (string) $this->stream->charsUntil('>');
1176
                    $this->stream->char();
1177
1178
                    $this->emitToken($this->token);
1179
1180
                    /* Switch to the data state. */
1181
                    $state = 'data';
1182
                break;
1183
1184
                case 'markup declaration open':
1185
                    // Consume for below
1186
                    $hyphens = $this->stream->charsWhile('-', 2);
1187
                    if ($hyphens === '-') {
1188
                        $this->stream->unget();
1189
                    }
1190
                    if ($hyphens !== '--') {
1191
                        $alpha = $this->stream->charsWhile(self::ALPHA, 7);
1192
                    }
1193
1194
                    /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1195
                    characters, consume those two characters, create a comment token whose
1196
                    data is the empty string, and switch to the comment state. */
1197
                    if ($hyphens === '--') {
1198
                        $state = 'comment start';
1199
                        $this->token = array(
1200
                            'data' => '',
1201
                            'type' => self::COMMENT
1202
                        );
1203
1204
                    /* Otherwise if the next seven characters are a case-insensitive match
1205
                    for the word "DOCTYPE", then consume those characters and switch to the
1206
                    DOCTYPE state. */
1207
                    } elseif (strtoupper($alpha) === 'DOCTYPE') {
1208
                        $state = 'DOCTYPE';
1209
1210
                    // XXX not implemented
1211
                    /* Otherwise, if the insertion mode is "in foreign content"
1212
                    and the current node is not an element in the HTML namespace
1213
                    and the next seven characters are an ASCII case-sensitive
1214
                    match for the string "[CDATA[" (the five uppercase letters
1215
                    "CDATA" with a U+005B LEFT SQUARE BRACKET character before
1216
                    and after), then consume those characters and switch to the
1217
                    CDATA section state (which is unrelated to the content model
1218
                    flag's CDATA state). */
1219
1220
                    /* Otherwise, it is a parse error. Switch to the bogus comment state.
1221
                    The next character that is consumed, if any, is the first character
1222
                    that will be in the comment. */
1223
                    } else {
1224
                        $this->emitToken(array(
1225
                            'type' => self::PARSEERROR,
1226
                            'data' => 'expected-dashes-or-doctype'
1227
                        ));
1228
                        $this->token = array(
1229
                            'data' => (string) $alpha,
1230
                            'type' => self::COMMENT
1231
                        );
1232
                        $state = 'bogus comment';
1233
                    }
1234
                break;
1235
1236
                case 'comment start':
1237
                    /* Consume the next input character: */
1238
                    $char = $this->stream->char();
1239
1240
                    if ($char === '-') {
1241
                        /* U+002D HYPHEN-MINUS (-)
1242
                        Switch to the comment start dash state. */
1243
                        $state = 'comment start dash';
1244
                    } elseif ($char === '>') {
1245
                        /* U+003E GREATER-THAN SIGN (>)
1246
                        Parse error. Emit the comment token. Switch to the
1247
                        data state. */
1248
                        $this->emitToken(array(
1249
                            'type' => self::PARSEERROR,
1250
                            'data' => 'incorrect-comment'
1251
                        ));
1252
                        $this->emitToken($this->token);
1253
                        $state = 'data';
1254
                    } elseif ($char === false) {
1255
                        /* EOF
1256
                        Parse error. Emit the comment token. Reconsume the
1257
                        EOF character in the data state. */
1258
                        $this->emitToken(array(
1259
                            'type' => self::PARSEERROR,
1260
                            'data' => 'eof-in-comment'
1261
                        ));
1262
                        $this->emitToken($this->token);
1263
                        $this->stream->unget();
1264
                        $state = 'data';
1265
                    } else {
1266
                        /* Anything else
1267
                        Append the input character to the comment token's
1268
                        data. Switch to the comment state. */
1269
                        $this->token['data'] .= $char;
1270
                        $state = 'comment';
1271
                    }
1272
                break;
1273
1274
                case 'comment start dash':
1275
                    /* Consume the next input character: */
1276
                    $char = $this->stream->char();
1277
                    if ($char === '-') {
1278
                        /* U+002D HYPHEN-MINUS (-)
1279
                        Switch to the comment end state */
1280
                        $state = 'comment end';
1281
                    } elseif ($char === '>') {
1282
                        /* U+003E GREATER-THAN SIGN (>)
1283
                        Parse error. Emit the comment token. Switch to the
1284
                        data state. */
1285
                        $this->emitToken(array(
1286
                            'type' => self::PARSEERROR,
1287
                            'data' => 'incorrect-comment'
1288
                        ));
1289
                        $this->emitToken($this->token);
1290
                        $state = 'data';
1291
                    } elseif ($char === false) {
1292
                        /* Parse error. Emit the comment token. Reconsume the
1293
                        EOF character in the data state. */
1294
                        $this->emitToken(array(
1295
                            'type' => self::PARSEERROR,
1296
                            'data' => 'eof-in-comment'
1297
                        ));
1298
                        $this->emitToken($this->token);
1299
                        $this->stream->unget();
1300
                        $state = 'data';
1301
                    } else {
1302
                        $this->token['data'] .= '-' . $char;
1303
                        $state = 'comment';
1304
                    }
1305
                break;
1306
1307
                case 'comment':
1308
                    /* Consume the next input character: */
1309
                    $char = $this->stream->char();
1310
1311
                    if ($char === '-') {
1312
                        /* U+002D HYPHEN-MINUS (-)
1313
                        Switch to the comment end dash state */
1314
                        $state = 'comment end dash';
1315
1316
                    } elseif ($char === false) {
1317
                        /* EOF
1318
                        Parse error. Emit the comment token. Reconsume the EOF character
1319
                        in the data state. */
1320
                        $this->emitToken(array(
1321
                            'type' => self::PARSEERROR,
1322
                            'data' => 'eof-in-comment'
1323
                        ));
1324
                        $this->emitToken($this->token);
1325
                        $this->stream->unget();
1326
                        $state = 'data';
1327
1328
                    } else {
1329
                        /* Anything else
1330
                        Append the input character to the comment token's data. Stay in
1331
                        the comment state. */
1332
                        $chars = $this->stream->charsUntil('-');
1333
1334
                        $this->token['data'] .= $char . $chars;
1335
                    }
1336
                break;
1337
1338
                case 'comment end dash':
1339
                    /* Consume the next input character: */
1340
                    $char = $this->stream->char();
1341
1342
                    if ($char === '-') {
1343
                        /* U+002D HYPHEN-MINUS (-)
1344
                        Switch to the comment end state  */
1345
                        $state = 'comment end';
1346
1347
                    } elseif ($char === false) {
1348
                        /* EOF
1349
                        Parse error. Emit the comment token. Reconsume the EOF character
1350
                        in the data state. */
1351
                        $this->emitToken(array(
1352
                            'type' => self::PARSEERROR,
1353
                            'data' => 'eof-in-comment-end-dash'
1354
                        ));
1355
                        $this->emitToken($this->token);
1356
                        $this->stream->unget();
1357
                        $state = 'data';
1358
1359
                    } else {
1360
                        /* Anything else
1361
                        Append a U+002D HYPHEN-MINUS (-) character and the input
1362
                        character to the comment token's data. Switch to the comment state. */
1363
                        $this->token['data'] .= '-'.$char;
1364
                        $state = 'comment';
1365
                    }
1366
                break;
1367
1368
                case 'comment end':
1369
                    /* Consume the next input character: */
1370
                    $char = $this->stream->char();
1371
1372
                    if ($char === '>') {
1373
                        /* U+003E GREATER-THAN SIGN (>)
1374
                        Emit the comment token. Switch to the data state. */
1375
                        $this->emitToken($this->token);
1376
                        $state = 'data';
1377
1378
                    } elseif ($char === '-') {
1379
                        /* U+002D HYPHEN-MINUS (-)
1380
                        Parse error. Append a U+002D HYPHEN-MINUS (-) character
1381
                        to the comment token's data. Stay in the comment end
1382
                        state. */
1383
                        $this->emitToken(array(
1384
                            'type' => self::PARSEERROR,
1385
                            'data' => 'unexpected-dash-after-double-dash-in-comment'
1386
                        ));
1387
                        $this->token['data'] .= '-';
1388
1389
                    } elseif ($char === "\t" || $char === "\n" || $char === "\x0a" || $char === ' ') {
1390
                        $this->emitToken(array(
1391
                            'type' => self::PARSEERROR,
1392
                            'data' => 'unexpected-space-after-double-dash-in-comment'
1393
                        ));
1394
                        $this->token['data'] .= '--' . $char;
1395
                        $state = 'comment end space';
1396
1397
                    } elseif ($char === '!') {
1398
                        $this->emitToken(array(
1399
                            'type' => self::PARSEERROR,
1400
                            'data' => 'unexpected-bang-after-double-dash-in-comment'
1401
                        ));
1402
                        $state = 'comment end bang';
1403
1404
                    } elseif ($char === false) {
1405
                        /* EOF
1406
                        Parse error. Emit the comment token. Reconsume the
1407
                        EOF character in the data state. */
1408
                        $this->emitToken(array(
1409
                            'type' => self::PARSEERROR,
1410
                            'data' => 'eof-in-comment-double-dash'
1411
                        ));
1412
                        $this->emitToken($this->token);
1413
                        $this->stream->unget();
1414
                        $state = 'data';
1415
1416
                    } else {
1417
                        /* Anything else
1418
                        Parse error. Append two U+002D HYPHEN-MINUS (-)
1419
                        characters and the input character to the comment token's
1420
                        data. Switch to the comment state. */
1421
                        $this->emitToken(array(
1422
                            'type' => self::PARSEERROR,
1423
                            'data' => 'unexpected-char-in-comment'
1424
                        ));
1425
                        $this->token['data'] .= '--'.$char;
1426
                        $state = 'comment';
1427
                    }
1428
                break;
1429
1430
                case 'comment end bang':
1431
                    $char = $this->stream->char();
1432
                    if ($char === '>') {
1433
                        $this->emitToken($this->token);
1434
                        $state = 'data';
1435
                    } elseif ($char === "-") {
1436
                        $this->token['data'] .= '--!';
1437
                        $state = 'comment end dash';
1438
                    } elseif ($char === false) {
1439
                        $this->emitToken(array(
1440
                            'type' => self::PARSEERROR,
1441
                            'data' => 'eof-in-comment-end-bang'
1442
                        ));
1443
                        $this->emitToken($this->token);
1444
                        $this->stream->unget();
1445
                        $state = 'data';
1446
                    } else {
1447
                        $this->token['data'] .= '--!' . $char;
1448
                        $state = 'comment';
1449
                    }
1450
                break;
1451
1452
                case 'comment end space':
1453
                    $char = $this->stream->char();
1454
                    if ($char === '>') {
1455
                        $this->emitToken($this->token);
1456
                        $state = 'data';
1457
                    } elseif ($char === '-') {
1458
                        $state = 'comment end dash';
1459
                    } elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1460
                        $this->token['data'] .= $char;
1461
                    } elseif ($char === false) {
1462
                        $this->emitToken(array(
1463
                            'type' => self::PARSEERROR,
1464
                            'data' => 'unexpected-eof-in-comment-end-space',
1465
                        ));
1466
                        $this->emitToken($this->token);
1467
                        $this->stream->unget();
1468
                        $state = 'data';
1469
                    } else {
1470
                        $this->token['data'] .= $char;
1471
                        $state = 'comment';
1472
                    }
1473
                break;
1474
1475
                case 'DOCTYPE':
1476
                    /* Consume the next input character: */
1477
                    $char = $this->stream->char();
1478
1479
                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1480
                        /* U+0009 CHARACTER TABULATION
1481
                           U+000A LINE FEED (LF)
1482
                           U+000C FORM FEED (FF)
1483
                           U+0020 SPACE
1484
                        Switch to the before DOCTYPE name state. */
1485
                        $state = 'before DOCTYPE name';
1486
1487
                    } elseif ($char === false) {
1488
                        /* EOF
1489
                        Parse error. Create a new DOCTYPE token. Set its
1490
                        force-quirks flag to on. Emit the token. Reconsume the
1491
                        EOF character in the data state. */
1492
                        $this->emitToken(array(
1493
                            'type' => self::PARSEERROR,
1494
                            'data' => 'need-space-after-doctype-but-got-eof'
1495
                        ));
1496
                        $this->emitToken(array(
1497
                            'name' => '',
1498
                            'type' => self::DOCTYPE,
1499
                            'force-quirks' => true,
1500
                            'error' => true
1501
                        ));
1502
                        $this->stream->unget();
1503
                        $state = 'data';
1504
1505
                    } else {
1506
                        /* Anything else
1507
                        Parse error. Reconsume the current character in the
1508
                        before DOCTYPE name state. */
1509
                        $this->emitToken(array(
1510
                            'type' => self::PARSEERROR,
1511
                            'data' => 'need-space-after-doctype'
1512
                        ));
1513
                        $this->stream->unget();
1514
                        $state = 'before DOCTYPE name';
1515
                    }
1516
                break;
1517
1518
                case 'before DOCTYPE name':
1519
                    /* Consume the next input character: */
1520
                    $char = $this->stream->char();
1521
1522
                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1523
                        /* U+0009 CHARACTER TABULATION
1524
                           U+000A LINE FEED (LF)
1525
                           U+000C FORM FEED (FF)
1526
                           U+0020 SPACE
1527
                        Stay in the before DOCTYPE name state. */
1528
1529
                    } elseif ($char === '>') {
1530
                        /* U+003E GREATER-THAN SIGN (>)
1531
                        Parse error. Create a new DOCTYPE token. Set its
1532
                        force-quirks flag to on. Emit the token. Switch to the
1533
                        data state. */
1534
                        $this->emitToken(array(
1535
                            'type' => self::PARSEERROR,
1536
                            'data' => 'expected-doctype-name-but-got-right-bracket'
1537
                        ));
1538
                        $this->emitToken(array(
1539
                            'name' => '',
1540
                            'type' => self::DOCTYPE,
1541
                            'force-quirks' => true,
1542
                            'error' => true
1543
                        ));
1544
1545
                        $state = 'data';
1546
1547
                    } elseif ('A' <= $char && $char <= 'Z') {
1548
                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1549
                        Create a new DOCTYPE token. Set the token's name to the
1550
                        lowercase version of the input character (add 0x0020 to
1551
                        the character's code point). Switch to the DOCTYPE name
1552
                        state. */
1553
                        $this->token = array(
1554
                            'name' => strtolower($char),
1555
                            'type' => self::DOCTYPE,
1556
                            'error' => true
1557
                        );
1558
1559
                        $state = 'DOCTYPE name';
1560
1561
                    } elseif ($char === false) {
1562
                        /* EOF
1563
                        Parse error. Create a new DOCTYPE token. Set its
1564
                        force-quirks flag to on. Emit the token. Reconsume the
1565
                        EOF character in the data state. */
1566
                        $this->emitToken(array(
1567
                            'type' => self::PARSEERROR,
1568
                            'data' => 'expected-doctype-name-but-got-eof'
1569
                        ));
1570
                        $this->emitToken(array(
1571
                            'name' => '',
1572
                            'type' => self::DOCTYPE,
1573
                            'force-quirks' => true,
1574
                            'error' => true
1575
                        ));
1576
1577
                        $this->stream->unget();
1578
                        $state = 'data';
1579
1580
                    } else {
1581
                        /* Anything else
1582
                        Create a new DOCTYPE token. Set the token's name to the
1583
                        current input character. Switch to the DOCTYPE name state. */
1584
                        $this->token = array(
1585
                            'name' => $char,
1586
                            'type' => self::DOCTYPE,
1587
                            'error' => true
1588
                        );
1589
1590
                        $state = 'DOCTYPE name';
1591
                    }
1592
                break;
1593
1594
                case 'DOCTYPE name':
1595
                    /* Consume the next input character: */
1596
                    $char = $this->stream->char();
1597
1598
                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1599
                        /* U+0009 CHARACTER TABULATION
1600
                           U+000A LINE FEED (LF)
1601
                           U+000C FORM FEED (FF)
1602
                           U+0020 SPACE
1603
                        Switch to the after DOCTYPE name state. */
1604
                        $state = 'after DOCTYPE name';
1605
1606
                    } elseif ($char === '>') {
1607
                        /* U+003E GREATER-THAN SIGN (>)
1608
                        Emit the current DOCTYPE token. Switch to the data state. */
1609
                        $this->emitToken($this->token);
1610
                        $state = 'data';
1611
1612
                    } elseif ('A' <= $char && $char <= 'Z') {
1613
                        /* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
1614
                        Append the lowercase version of the input character
1615
                        (add 0x0020 to the character's code point) to the current
1616
                        DOCTYPE token's name. Stay in the DOCTYPE name state. */
1617
                        $this->token['name'] .= strtolower($char);
1618
1619
                    } elseif ($char === false) {
1620
                        /* EOF
1621
                        Parse error. Set the DOCTYPE token's force-quirks flag
1622
                        to on. Emit that DOCTYPE token. Reconsume the EOF
1623
                        character in the data state. */
1624
                        $this->emitToken(array(
1625
                            'type' => self::PARSEERROR,
1626
                            'data' => 'eof-in-doctype-name'
1627
                        ));
1628
                        $this->token['force-quirks'] = true;
1629
                        $this->emitToken($this->token);
1630
                        $this->stream->unget();
1631
                        $state = 'data';
1632
1633
                    } else {
1634
                        /* Anything else
1635
                        Append the current input character to the current
1636
                        DOCTYPE token's name. Stay in the DOCTYPE name state. */
1637
                        $this->token['name'] .= $char;
1638
                    }
1639
1640
                    // XXX this is probably some sort of quirks mode designation,
1641
                    // check tree-builder to be sure. In general 'error' needs
1642
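                    // to be specc'ified, this probably means removing it at the end.
                    // NOTE(review): the 'before DOCTYPE name' and 'DOCTYPE name' states
                    // lowercase A-Z before storing it in the name, so the comparison
                    // with 'HTML' below looks like it can never match - confirm.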
1643
                    $this->token['error'] = ($this->token['name'] === 'HTML')
1644
                        ? false
1645
                        : true;
1646
                break;
1647
1648
                case 'after DOCTYPE name':
1649
                    /* Consume the next input character: */
1650
                    $char = $this->stream->char();
1651
1652
                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1653
                        /* U+0009 CHARACTER TABULATION
1654
                           U+000A LINE FEED (LF)
1655
                           U+000C FORM FEED (FF)
1656
                           U+0020 SPACE
1657
                        Stay in the after DOCTYPE name state. */
1658
1659
                    } elseif ($char === '>') {
1660
                        /* U+003E GREATER-THAN SIGN (>)
1661
                        Emit the current DOCTYPE token. Switch to the data state. */
1662
                        $this->emitToken($this->token);
1663
                        $state = 'data';
1664
1665
                    } elseif ($char === false) {
1666
                        /* EOF
1667
                        Parse error. Set the DOCTYPE token's force-quirks flag
1668
                        to on. Emit that DOCTYPE token. Reconsume the EOF
1669
                        character in the data state. */
1670
                        $this->emitToken(array(
1671
                            'type' => self::PARSEERROR,
1672
                            'data' => 'eof-in-doctype'
1673
                        ));
1674
                        $this->token['force-quirks'] = true;
1675
                        $this->emitToken($this->token);
1676
                        $this->stream->unget();
1677
                        $state = 'data';
1678
1679
                    } else {
1680
                        /* Anything else */
1681
1682
                        $nextSix = strtoupper($char . $this->stream->charsWhile(self::ALPHA, 5));
1683
                        if ($nextSix === 'PUBLIC') {
1684
                            /* If the next six characters are an ASCII
1685
                            case-insensitive match for the word "PUBLIC", then
1686
                            consume those characters and switch to the before
1687
                            DOCTYPE public identifier state. */
1688
                            $state = 'before DOCTYPE public identifier';
1689
1690
                        } elseif ($nextSix === 'SYSTEM') {
1691
                            /* Otherwise, if the next six characters are an ASCII
1692
                            case-insensitive match for the word "SYSTEM", then
1693
                            consume those characters and switch to the before
1694
                            DOCTYPE system identifier state. */
1695
                            $state = 'before DOCTYPE system identifier';
1696
1697
                        } else {
1698
                            /* Otherwise, this is a parse error. Set the DOCTYPE
1699
                            token's force-quirks flag to on. Switch to the bogus
1700
                            DOCTYPE state. */
1701
                            $this->emitToken(array(
1702
                                'type' => self::PARSEERROR,
1703
                                'data' => 'expected-space-or-right-bracket-in-doctype'
1704
                            ));
1705
                            $this->token['force-quirks'] = true;
1706
                            $this->token['error'] = true;
1707
                            $state = 'bogus DOCTYPE';
1708
                        }
1709
                    }
1710
                break;
1711
1712
                case 'before DOCTYPE public identifier':
1713
                    /* Consume the next input character: */
1714
                    $char = $this->stream->char();
1715
1716
                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1717
                        /* U+0009 CHARACTER TABULATION
1718
                           U+000A LINE FEED (LF)
1719
                           U+000C FORM FEED (FF)
1720
                           U+0020 SPACE
1721
                        Stay in the before DOCTYPE public identifier state. */
1722
                    } elseif ($char === '"') {
1723
                        /* U+0022 QUOTATION MARK (")
1724
                        Set the DOCTYPE token's public identifier to the empty
1725
                        string (not missing), then switch to the DOCTYPE public
1726
                        identifier (double-quoted) state. */
1727
                        $this->token['public'] = '';
1728
                        $state = 'DOCTYPE public identifier (double-quoted)';
1729
                    } elseif ($char === "'") {
1730
                        /* U+0027 APOSTROPHE (')
1731
                        Set the DOCTYPE token's public identifier to the empty
1732
                        string (not missing), then switch to the DOCTYPE public
1733
                        identifier (single-quoted) state. */
1734
                        $this->token['public'] = '';
1735
                        $state = 'DOCTYPE public identifier (single-quoted)';
1736
                    } elseif ($char === '>') {
1737
                        /* Parse error. Set the DOCTYPE token's force-quirks flag
1738
                        to on. Emit that DOCTYPE token. Switch to the data state. */
1739
                        $this->emitToken(array(
1740
                            'type' => self::PARSEERROR,
1741
                            'data' => 'unexpected-end-of-doctype'
1742
                        ));
1743
                        $this->token['force-quirks'] = true;
1744
                        $this->emitToken($this->token);
1745
                        $state = 'data';
1746
                    } elseif ($char === false) {
1747
                        /* Parse error. Set the DOCTYPE token's force-quirks
1748
                        flag to on. Emit that DOCTYPE token. Reconsume the EOF
1749
                        character in the data state. */
1750
                        $this->emitToken(array(
1751
                            'type' => self::PARSEERROR,
1752
                            'data' => 'eof-in-doctype'
1753
                        ));
1754
                        $this->token['force-quirks'] = true;
1755
                        $this->emitToken($this->token);
1756
                        $this->stream->unget();
1757
                        $state = 'data';
1758
                    } else {
1759
                        /* Parse error. Set the DOCTYPE token's force-quirks flag
1760
                        to on. Switch to the bogus DOCTYPE state. */
1761
                        $this->emitToken(array(
1762
                            'type' => self::PARSEERROR,
1763
                            'data' => 'unexpected-char-in-doctype'
1764
                        ));
1765
                        $this->token['force-quirks'] = true;
1766
                        $state = 'bogus DOCTYPE';
1767
                    }
1768
                break;
1769
1770
                case 'DOCTYPE public identifier (double-quoted)':
1771
                    /* Consume the next input character: */
1772
                    $char = $this->stream->char();
1773
1774
                    if ($char === '"') {
1775
                        /* U+0022 QUOTATION MARK (")
1776
                        Switch to the after DOCTYPE public identifier state. */
1777
                        $state = 'after DOCTYPE public identifier';
1778
                    } elseif ($char === '>') {
1779
                        /* U+003E GREATER-THAN SIGN (>)
1780
                        Parse error. Set the DOCTYPE token's force-quirks flag
1781
                        to on. Emit that DOCTYPE token. Switch to the data state. */
1782
                        $this->emitToken(array(
1783
                            'type' => self::PARSEERROR,
1784
                            'data' => 'unexpected-end-of-doctype'
1785
                        ));
1786
                        $this->token['force-quirks'] = true;
1787
                        $this->emitToken($this->token);
1788
                        $state = 'data';
1789
                    } elseif ($char === false) {
1790
                        /* EOF
1791
                        Parse error. Set the DOCTYPE token's force-quirks flag
1792
                        to on. Emit that DOCTYPE token. Reconsume the EOF
1793
                        character in the data state. */
1794
                        $this->emitToken(array(
1795
                            'type' => self::PARSEERROR,
1796
                            'data' => 'eof-in-doctype'
1797
                        ));
1798
                        $this->token['force-quirks'] = true;
1799
                        $this->emitToken($this->token);
1800
                        $this->stream->unget();
1801
                        $state = 'data';
1802
                    } else {
1803
                        /* Anything else
1804
                        Append the current input character to the current
1805
                        DOCTYPE token's public identifier. Stay in the DOCTYPE
1806
                        public identifier (double-quoted) state. */
1807
                        $this->token['public'] .= $char;
1808
                    }
1809
                break;
1810
1811
                case 'DOCTYPE public identifier (single-quoted)':
1812
                    /* Consume the next input character: */
1813
                    $char = $this->stream->char();
1814
1815
                    if ($char === "'") {
1816
                        /* U+0027 APOSTROPHE (')
1817
                        Switch to the after DOCTYPE public identifier state. */
1818
                        $state = 'after DOCTYPE public identifier';
1819
                    } elseif ($char === '>') {
1820
                        /* U+003E GREATER-THAN SIGN (>)
1821
                        Parse error. Set the DOCTYPE token's force-quirks flag
1822
                        to on. Emit that DOCTYPE token. Switch to the data state. */
1823
                        $this->emitToken(array(
1824
                            'type' => self::PARSEERROR,
1825
                            'data' => 'unexpected-end-of-doctype'
1826
                        ));
1827
                        $this->token['force-quirks'] = true;
1828
                        $this->emitToken($this->token);
1829
                        $state = 'data';
1830
                    } elseif ($char === false) {
1831
                        /* EOF
1832
                        Parse error. Set the DOCTYPE token's force-quirks flag
1833
                        to on. Emit that DOCTYPE token. Reconsume the EOF
1834
                        character in the data state. */
1835
                        $this->emitToken(array(
1836
                            'type' => self::PARSEERROR,
1837
                            'data' => 'eof-in-doctype'
1838
                        ));
1839
                        $this->token['force-quirks'] = true;
1840
                        $this->emitToken($this->token);
1841
                        $this->stream->unget();
1842
                        $state = 'data';
1843
                    } else {
1844
                        /* Anything else
1845
                        Append the current input character to the current
1846
                        DOCTYPE token's public identifier. Stay in the DOCTYPE
1847
                        public identifier (single-quoted) state. */
1848
                        $this->token['public'] .= $char;
1849
                    }
1850
                break;
1851
1852
                case 'after DOCTYPE public identifier':
1853
                    /* Consume the next input character: */
1854
                    $char = $this->stream->char();
1855
1856
                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1857
                        /* U+0009 CHARACTER TABULATION
1858
                           U+000A LINE FEED (LF)
1859
                           U+000C FORM FEED (FF)
1860
                           U+0020 SPACE
1861
                        Stay in the after DOCTYPE public identifier state. */
1862
                    } elseif ($char === '"') {
1863
                        /* U+0022 QUOTATION MARK (")
1864
                        Set the DOCTYPE token's system identifier to the
1865
                        empty string (not missing), then switch to the DOCTYPE
1866
                        system identifier (double-quoted) state. */
1867
                        $this->token['system'] = '';
1868
                        $state = 'DOCTYPE system identifier (double-quoted)';
1869
                    } elseif ($char === "'") {
1870
                        /* U+0027 APOSTROPHE (')
1871
                        Set the DOCTYPE token's system identifier to the
1872
                        empty string (not missing), then switch to the DOCTYPE
1873
                        system identifier (single-quoted) state. */
1874
                        $this->token['system'] = '';
1875
                        $state = 'DOCTYPE system identifier (single-quoted)';
1876
                    } elseif ($char === '>') {
1877
                        /* U+003E GREATER-THAN SIGN (>)
1878
                        Emit the current DOCTYPE token. Switch to the data state. */
1879
                        $this->emitToken($this->token);
1880
                        $state = 'data';
1881
                    } elseif ($char === false) {
1882
                        /* Parse error. Set the DOCTYPE token's force-quirks
1883
                        flag to on. Emit that DOCTYPE token. Reconsume the EOF
1884
                        character in the data state. */
1885
                        $this->emitToken(array(
1886
                            'type' => self::PARSEERROR,
1887
                            'data' => 'eof-in-doctype'
1888
                        ));
1889
                        $this->token['force-quirks'] = true;
1890
                        $this->emitToken($this->token);
1891
                        $this->stream->unget();
1892
                        $state = 'data';
1893
                    } else {
1894
                        /* Anything else
1895
                        Parse error. Set the DOCTYPE token's force-quirks flag
1896
                        to on. Switch to the bogus DOCTYPE state. */
1897
                        $this->emitToken(array(
1898
                            'type' => self::PARSEERROR,
1899
                            'data' => 'unexpected-char-in-doctype'
1900
                        ));
1901
                        $this->token['force-quirks'] = true;
1902
                        $state = 'bogus DOCTYPE';
1903
                    }
1904
                break;
1905
1906
                case 'before DOCTYPE system identifier':
1907
                    /* Consume the next input character: */
1908
                    $char = $this->stream->char();
1909
1910
                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
1911
                        /* U+0009 CHARACTER TABULATION
1912
                           U+000A LINE FEED (LF)
1913
                           U+000C FORM FEED (FF)
1914
                           U+0020 SPACE
1915
                        Stay in the before DOCTYPE system identifier state. */
1916
                    } elseif ($char === '"') {
1917
                        /* U+0022 QUOTATION MARK (")
1918
                        Set the DOCTYPE token's system identifier to the empty
1919
                        string (not missing), then switch to the DOCTYPE system
1920
                        identifier (double-quoted) state. */
1921
                        $this->token['system'] = '';
1922
                        $state = 'DOCTYPE system identifier (double-quoted)';
1923
                    } elseif ($char === "'") {
1924
                        /* U+0027 APOSTROPHE (')
1925
                        Set the DOCTYPE token's system identifier to the empty
1926
                        string (not missing), then switch to the DOCTYPE system
1927
                        identifier (single-quoted) state. */
1928
                        $this->token['system'] = '';
1929
                        $state = 'DOCTYPE system identifier (single-quoted)';
1930
                    } elseif ($char === '>') {
1931
                        /* Parse error. Set the DOCTYPE token's force-quirks flag
1932
                        to on. Emit that DOCTYPE token. Switch to the data state. */
1933
                        $this->emitToken(array(
1934
                            'type' => self::PARSEERROR,
1935
                            'data' => 'unexpected-char-in-doctype'
1936
                        ));
1937
                        $this->token['force-quirks'] = true;
1938
                        $this->emitToken($this->token);
1939
                        $state = 'data';
1940
                    } elseif ($char === false) {
1941
                        /* Parse error. Set the DOCTYPE token's force-quirks
1942
                        flag to on. Emit that DOCTYPE token. Reconsume the EOF
1943
                        character in the data state. */
1944
                        $this->emitToken(array(
1945
                            'type' => self::PARSEERROR,
1946
                            'data' => 'eof-in-doctype'
1947
                        ));
1948
                        $this->token['force-quirks'] = true;
1949
                        $this->emitToken($this->token);
1950
                        $this->stream->unget();
1951
                        $state = 'data';
1952
                    } else {
1953
                        /* Parse error. Set the DOCTYPE token's force-quirks flag
1954
                        to on. Switch to the bogus DOCTYPE state. */
1955
                        $this->emitToken(array(
1956
                            'type' => self::PARSEERROR,
1957
                            'data' => 'unexpected-char-in-doctype'
1958
                        ));
1959
                        $this->token['force-quirks'] = true;
1960
                        $state = 'bogus DOCTYPE';
1961
                    }
1962
                break;
1963
1964
                case 'DOCTYPE system identifier (double-quoted)':
1965
                    /* Consume the next input character: */
1966
                    $char = $this->stream->char();
1967
1968
                    if ($char === '"') {
1969
                        /* U+0022 QUOTATION MARK (")
1970
                        Switch to the after DOCTYPE system identifier state. */
1971
                        $state = 'after DOCTYPE system identifier';
1972
                    } elseif ($char === '>') {
1973
                        /* U+003E GREATER-THAN SIGN (>)
1974
                        Parse error. Set the DOCTYPE token's force-quirks flag
1975
                        to on. Emit that DOCTYPE token. Switch to the data state. */
1976
                        $this->emitToken(array(
1977
                            'type' => self::PARSEERROR,
1978
                            'data' => 'unexpected-end-of-doctype'
1979
                        ));
1980
                        $this->token['force-quirks'] = true;
1981
                        $this->emitToken($this->token);
1982
                        $state = 'data';
1983
                    } elseif ($char === false) {
1984
                        /* EOF
1985
                        Parse error. Set the DOCTYPE token's force-quirks flag
1986
                        to on. Emit that DOCTYPE token. Reconsume the EOF
1987
                        character in the data state. */
1988
                        $this->emitToken(array(
1989
                            'type' => self::PARSEERROR,
1990
                            'data' => 'eof-in-doctype'
1991
                        ));
1992
                        $this->token['force-quirks'] = true;
1993
                        $this->emitToken($this->token);
1994
                        $this->stream->unget();
1995
                        $state = 'data';
1996
                    } else {
1997
                        /* Anything else
1998
                        Append the current input character to the current
1999
                        DOCTYPE token's system identifier. Stay in the DOCTYPE
2000
                        system identifier (double-quoted) state. */
2001
                        $this->token['system'] .= $char;
2002
                    }
2003
                break;
2004
2005
                case 'DOCTYPE system identifier (single-quoted)':
2006
                    /* Consume the next input character: */
2007
                    $char = $this->stream->char();
2008
2009
                    if ($char === "'") {
2010
                        /* U+0027 APOSTROPHE (')
2011
                        Switch to the after DOCTYPE system identifier state. */
2012
                        $state = 'after DOCTYPE system identifier';
2013
                    } elseif ($char === '>') {
2014
                        /* U+003E GREATER-THAN SIGN (>)
2015
                        Parse error. Set the DOCTYPE token's force-quirks flag
2016
                        to on. Emit that DOCTYPE token. Switch to the data state. */
2017
                        $this->emitToken(array(
2018
                            'type' => self::PARSEERROR,
2019
                            'data' => 'unexpected-end-of-doctype'
2020
                        ));
2021
                        $this->token['force-quirks'] = true;
2022
                        $this->emitToken($this->token);
2023
                        $state = 'data';
2024
                    } elseif ($char === false) {
2025
                        /* EOF
2026
                        Parse error. Set the DOCTYPE token's force-quirks flag
2027
                        to on. Emit that DOCTYPE token. Reconsume the EOF
2028
                        character in the data state. */
2029
                        $this->emitToken(array(
2030
                            'type' => self::PARSEERROR,
2031
                            'data' => 'eof-in-doctype'
2032
                        ));
2033
                        $this->token['force-quirks'] = true;
2034
                        $this->emitToken($this->token);
2035
                        $this->stream->unget();
2036
                        $state = 'data';
2037
                    } else {
2038
                        /* Anything else
2039
                        Append the current input character to the current
2040
                        DOCTYPE token's system identifier. Stay in the DOCTYPE
2041
                        system identifier (single-quoted) state. */
2042
                        $this->token['system'] .= $char;
2043
                    }
2044
                break;
2045
2046
                case 'after DOCTYPE system identifier':
2047
                    /* Consume the next input character: */
2048
                    $char = $this->stream->char();
2049
2050
                    if ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
2051
                        /* U+0009 CHARACTER TABULATION
2052
                           U+000A LINE FEED (LF)
2053
                           U+000C FORM FEED (FF)
2054
                           U+0020 SPACE
2055
                        Stay in the after DOCTYPE system identifier state. */
2056
                    } elseif ($char === '>') {
2057
                        /* U+003E GREATER-THAN SIGN (>)
2058
                        Emit the current DOCTYPE token. Switch to the data state. */
2059
                        $this->emitToken($this->token);
2060
                        $state = 'data';
2061
                    } elseif ($char === false) {
2062
                        /* Parse error. Set the DOCTYPE token's force-quirks
2063
                        flag to on. Emit that DOCTYPE token. Reconsume the EOF
2064
                        character in the data state. */
2065
                        $this->emitToken(array(
2066
                            'type' => self::PARSEERROR,
2067
                            'data' => 'eof-in-doctype'
2068
                        ));
2069
                        $this->token['force-quirks'] = true;
2070
                        $this->emitToken($this->token);
2071
                        $this->stream->unget();
2072
                        $state = 'data';
2073
                    } else {
2074
                        /* Anything else
2075
                        Parse error. Switch to the bogus DOCTYPE state.
2076
                        (This does not set the DOCTYPE token's force-quirks
2077
                        flag to on.) */
2078
                        $this->emitToken(array(
2079
                            'type' => self::PARSEERROR,
2080
                            'data' => 'unexpected-char-in-doctype'
2081
                        ));
2082
                        $state = 'bogus DOCTYPE';
2083
                    }
2084
                break;
2085
2086
                case 'bogus DOCTYPE':
2087
                    /* Consume the next input character: */
2088
                    $char = $this->stream->char();
2089
2090
                    if ($char === '>') {
2091
                        /* U+003E GREATER-THAN SIGN (>)
2092
                        Emit the DOCTYPE token. Switch to the data state. */
2093
                        $this->emitToken($this->token);
2094
                        $state = 'data';
2095
2096
                    } elseif ($char === false) {
2097
                        /* EOF
2098
                        Emit the DOCTYPE token. Reconsume the EOF character in
2099
                        the data state. */
2100
                        $this->emitToken($this->token);
2101
                        $this->stream->unget();
2102
                        $state = 'data';
2103
2104
                    } else {
2105
                        /* Anything else
2106
                        Stay in the bogus DOCTYPE state. */
2107
                    }
2108
                break;
2109
2110
                // case 'cdataSection':
2111
            }
2112
        }
2113
    }
2114
2115
    /**
     * Delegates to the tree builder's save() and hands back its result
     * unchanged.
     *
     * @return DOMDocument|DOMNodeList
     */
    public function save() {
        $saved = $this->tree->save();

        return $saved;
    }
2123
2124
    /**
     * Accessor for the tree builder held by this tokenizer.
     *
     * @return HTML5_TreeBuilder The tree
     */
    public function getTree() {
        $builder = $this->tree;

        return $builder;
    }
2131
2132
2133
    /**
     * Accessor for the input stream this tokenizer reads from.
     *
     * @return HTML5_InputStream
     */
    public function stream() {
        $input = $this->stream;

        return $input;
    }
2141
2142
    /**
     * Attempts to consume a character reference from the input stream,
     * starting at the character that follows the U+0026 AMPERSAND.
     *
     * This goes quite far against the spec and is far closer to the Python
     * implementation, mainly because the large unconsuming the spec requires
     * is not done. Instead of "returning nothing", the method returns the
     * literal text that should be emitted ('&' plus whatever was consumed).
     *
     * @param string|false $allowed The additional allowed character, or false
     *                              when there is none.
     * @param bool         $inattr  True when the reference is being consumed
     *                              as part of an attribute value.
     * @return string The decoded character(s), or '&' followed by the
     *                consumed characters when no reference could be decoded.
     */
    private function consumeCharacterReference($allowed = false, $inattr = false) {
        // All consumed characters.
        $chars = $this->stream->char();

        /* The behavior depends on the identity of the next character
        (the one immediately after the U+0026 AMPERSAND character): */

        // EOF must be tested before any $chars[0] access: indexing into
        // false raises a notice/warning on PHP 7.4 and later.
        if (
            $chars === false ||
            $chars[0] === "\x09" ||
            $chars[0] === "\x0A" ||
            $chars[0] === "\x0C" ||
            $chars[0] === "\x20" ||
            $chars[0] === '<' ||
            $chars[0] === '&' ||
            $chars[0] === $allowed
        ) {
            /* U+0009 CHARACTER TABULATION
               U+000A LINE FEED (LF)
               U+000C FORM FEED (FF)
               U+0020 SPACE
               U+003C LESS-THAN SIGN
               U+0026 AMPERSAND
               EOF
               The additional allowed character, if there is one
            Not a character reference. No characters are consumed,
            and nothing is returned. (This is not an error, either.) */
            // We already consumed, so unconsume.
            $this->stream->unget();
            return '&';
        }

        if ($chars[0] === '#') {
            /* Consume the U+0023 NUMBER SIGN (already done above). The
            behavior further depends on the character after it: */
            $chars .= $this->stream->char();
            if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
                /* U+0078 LATIN SMALL LETTER X
                   U+0058 LATIN CAPITAL LETTER X
                The X is already consumed. Match 0123456789, ABCDEF, abcdef
                and interpret the number as hexadecimal. */
                $char_class = self::HEX;
                $hex = true;
            } else {
                /* Anything else */
                // Unconsume because we shouldn't have consumed this.
                $chars = $chars[0];
                $this->stream->unget();
                /* Match just 0123456789 and interpret the number as
                decimal. */
                $char_class = self::DIGIT;
                $hex = false;
            }

            /* Consume as many characters as match the range of characters given above. */
            $consumed = $this->stream->charsWhile($char_class);
            if ($consumed === '' || $consumed === false) {
                /* If no characters match the range, then don't consume
                any characters (and unconsume the U+0023 NUMBER SIGN
                character and, if appropriate, the X character). This
                is a parse error; nothing is returned. */
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'expected-numeric-entity'
                ));
                return '&' . $chars;
            }

            /* Otherwise, if the next character is a U+003B SEMICOLON,
            consume that too. If it isn't, there is a parse error. */
            if ($this->stream->char() !== ';') {
                $this->stream->unget();
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'numeric-entity-without-semicolon'
                ));
            }

            /* If one or more characters match the range, then take
            them all and interpret the string of characters as a number
            (either hexadecimal or decimal as appropriate). */
            $codepoint = $hex ? hexdec($consumed) : (int) $consumed;

            /* If that number is one of the numbers in the first column
            of the replacement table, then this is a parse error. Return
            a character token for the Unicode character given in the
            second column of that row. */
            $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
            if ($new_codepoint) {
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'illegal-windows-1252-entity'
                ));
                return HTML5_Data::utf8chr($new_codepoint);
            }

            /* Otherwise, if the number is greater than 0x10FFFF, then
             * this is a parse error. Return a U+FFFD REPLACEMENT
             * CHARACTER. */
            if ($codepoint > 0x10FFFF) {
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'overlong-character-entity' // XXX probably not correct
                ));
                return "\xEF\xBF\xBD";
            }

            /* Otherwise, return a character token for the Unicode
             * character whose code point is that number. Control
             * characters, surrogates, the 0xFDD0-0xFDEF block and the
             * last two code points of every plane (0xFFFE, 0xFFFF,
             * 0x1FFFE, ... 0x10FFFE, 0x10FFFF) are a parse error. */
            // && has higher precedence than ||. The 0xFFFE mask matches
            // xxFFFE and xxFFFF in every plane, 0x10FFFE/0x10FFFF included.
            if (
                $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
                $codepoint === 0x000B ||
                $codepoint >= 0x000E && $codepoint <= 0x001F ||
                $codepoint >= 0x007F && $codepoint <= 0x009F ||
                $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
                $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
                ($codepoint & 0xFFFE) === 0xFFFE
            ) {
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'illegal-codepoint-for-numeric-entity'
                ));
            }
            return HTML5_Data::utf8chr($codepoint);
        }

        /* Anything else */

        /* Consume the maximum number of characters possible,
        with the consumed characters matching one of the
        identifiers in the first column of the named character
        references table (in a case-sensitive manner). */
        // What we actually do here is consume as much as we can while it
        // matches the start of one of the identifiers in the first column.

        $refs = HTML5_Data::getNamedCharacterReferences();

        // Get the longest string which is the start of an identifier
        // ($chars) as well as the longest identifier which matches ($id)
        // and its codepoint ($codepoint).
        $codepoint = false;
        $char = $chars;
        while ($char !== false && isset($refs[$char])) {
            $refs = $refs[$char];
            if (isset($refs['codepoint'])) {
                $id = $chars;
                $codepoint = $refs['codepoint'];
            }
            $chars .= $char = $this->stream->char();
        }

        // Unconsume the one character we just took which caused the while
        // statement to fail. This could be anything and could cause state
        // changes (as if it matches the while loop it must be
        // alphanumeric so we can just concat it to whatever we get later).
        $this->stream->unget();
        if ($char !== false) {
            $chars = substr($chars, 0, -1);
        }

        /* If no match can be made, then this is a parse error.
        No characters are consumed, and nothing is returned. */
        if (!$codepoint) {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'expected-named-entity'
            ));
            return '&' . $chars;
        }

        /* If the last character matched is not a U+003B SEMICOLON
        (;), there is a parse error. */
        $semicolon = true;
        if (substr($id, -1) !== ';') {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'named-entity-without-semicolon'
            ));
            $semicolon = false;
        }

        /* If the character reference is being consumed as part of
        an attribute, and the last character matched is not a
        U+003B SEMICOLON (;), and the next character is in the
        range 0-9, A-Z or a-z, then, for historical reasons, all the
        characters that were matched after the U+0026 AMPERSAND (&)
        must be unconsumed, and nothing is returned. */
        if ($inattr && !$semicolon) {
            // The next character is either the next character in $chars or in the stream.
            if (strlen($chars) > strlen($id)) {
                $next = substr($chars, strlen($id), 1);
            } else {
                $next = $this->stream->char();
                $this->stream->unget();
            }
            // $next is false at EOF. PHP compares bool against string by
            // casting both to bool, which would make the digit range test
            // succeed for false, so require an actual character first.
            if (
                is_string($next) && (
                    '0' <= $next && $next <= '9' ||
                    'A' <= $next && $next <= 'Z' ||
                    'a' <= $next && $next <= 'z'
                )
            ) {
                return '&' . $chars;
            }
        }

        /* Otherwise, return a character token for the character
        corresponding to the character reference name (as given
        by the second column of the named character references table),
        followed by any characters consumed past the matched name. */
        return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id));
    }
2389
2390
    /**
     * Consumes a character reference inside an attribute value and appends
     * the result to the value of the last attribute of the current token.
     *
     * @param string|false $allowed The additional allowed character passed
     *                              on to consumeCharacterReference(), or
     *                              false when there is none.
     */
    private function characterReferenceInAttributeValue($allowed = false) {
        /* Attempt to consume a character reference. */
        $entity = $this->consumeCharacterReference($allowed, true);

        /* If nothing is returned, append a U+0026 AMPERSAND
        character to the current attribute's value.

        Otherwise, append the returned character token to the
        current attribute's value. */
        // Test for "nothing" explicitly rather than by truthiness: a
        // reference such as "&#48;" decodes to the string '0', which PHP
        // treats as falsy and which would otherwise be replaced by '&'.
        $char = ($entity === false || $entity === null || $entity === '')
            ? '&'
            : $entity;

        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        /* Finally, switch back to the attribute value state that you
        were in when you were switched into this state. */
    }
2412
2413
    /**
     * Hands a token to the tree builder, reporting beforehand any parse
     * errors that the token itself implies.
     *
     * Parse errors are emitted for attributes or a self-closing flag on an
     * end tag and for duplicate attribute names on a start tag. Afterwards
     * the tokenizer's content model is taken over from the tree builder when
     * the builder set an integer one; otherwise an end tag resets it to
     * PCDATA.
     *
     * @param array $token       The token to emit; must carry a 'type' key.
     * @param bool  $checkStream When true, errors queued on the input stream
     *                           are emitted first.
     * @param bool  $dry         When true, the token is checked but not
     *                           passed to the tree builder.
     */
    protected function emitToken($token, $checkStream = true, $dry = false) {
        if ($checkStream === true) {
            // Drain the input stream's error queue; the nested calls skip
            // this step themselves.
            while ($this->stream->errors) {
                $pending = array_shift($this->stream->errors);
                $this->emitToken($pending, false);
            }
        }

        $type = $token['type'];

        if ($type === self::ENDTAG) {
            if (!empty($token['attr'])) {
                // One parse error per attribute present on the end tag.
                $remaining = count($token['attr']);
                while ($remaining-- > 0) {
                    $this->emitToken(array(
                        'type' => self::PARSEERROR,
                        'data' => 'attributes-in-end-tag'
                    ));
                }
            }
            if (!empty($token['self-closing'])) {
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'self-closing-flag-on-end-tag',
                ));
            }
        }

        if ($type === self::STARTTAG) {
            // This could be changed to actually pass the tree-builder a hash
            $seen = array();
            foreach ($token['attr'] as $attribute) {
                $name = $attribute['name'];
                if (!isset($seen[$name])) {
                    $seen[$name] = $attribute['value'];
                    continue;
                }
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'duplicate-attribute',
                ));
            }
        }

        if ($dry === false) {
            // the current structure of attributes is not a terribly good one
            $this->tree->emitToken($token);

            if (is_int($this->tree->content_model)) {
                // Adopt the content model requested by the tree builder and
                // clear the request.
                $this->content_model = $this->tree->content_model;
                $this->tree->content_model = null;
                return;
            }
        }

        if ($type === self::ENDTAG) {
            $this->content_model = self::PCDATA;
        }
    }
2469
}
2470
2471