Completed
Branch development (b1b115)
by Johannes
10:28
created

HTML5TreeConstructer   F

Complexity

Total Complexity 569

Size/Duplication

Total Lines 2746
Duplicated Lines 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
wmc 569
c 1
b 0
f 0
dl 0
loc 2746
rs 0.8

How to fix   Complexity   

Complex Class

Complex classes like HTML5TreeConstructer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use HTML5TreeConstructer, and based on these observations, apply Extract Interface, too.

1
<?php
2
class HTML5
3
{
4
    /* Markup being tokenised; assigned by the constructor. */
    private $data;
5
    /* Index into $data of the current input character. The constructor
    starts it at -1 and each state handler advances it before reading. */
    private $char;
6
    /* Length of $data in bytes (strlen); a cursor equal to this value means
    the end of the input has been reached. */
    private $EOF;
7
    /* Name of the current tokenizer state. The constructor calls the method
    named "<state>State" until this becomes null. */
    private $state;
8
    /* HTML5TreeConstructer instance created by the constructor. */
    private $tree;
9
    /* Tag token currently being built (array with 'name', 'type' and, for
    start tags, 'attr'). */
    private $token;
10
    /* Content model flag: one of the PCDATA, RCDATA, CDATA or PLAINTEXT
    class constants. */
    private $content_model;
11
    /* Escape flag used by the data state while inside "<!--" ... "-->" in
    the RCDATA and CDATA content models. */
    private $escape = false;
12
    private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
13
    'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
14
    'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
15
    'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
16
    'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
17
    'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
18
    'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
19
    'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
20
    'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
21
    'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
22
    'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
23
    'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
24
    'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
25
    'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
26
    'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
27
    'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
28
    'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
29
    'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
30
    'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
31
    'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
32
    'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
33
    'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
34
    'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
35
    'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
36
    'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
37
    'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
38
    'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
39
    'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
40
    'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
41
    'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
42
    'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
43
    'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
44
    'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
45
    'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
46
    'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
47
    'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
48
    'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
49
    'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
50
    'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
51
    'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
52
    'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
53
    'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');
54
55
    /* Content model flag values (stored in $content_model). */
    const PCDATA    = 0;
56
    const RCDATA    = 1;
57
    const CDATA     = 2;
58
    const PLAINTEXT = 3;
59
60
    /* Token type identifiers, used as the 'type' entry of token arrays. */
    const DOCTYPE  = 0;
61
    const STARTTAG = 1;
62
    const ENDTAG   = 2;
63
    const COMMENT  = 3;
64
    const CHARACTR = 4;
65
    const EOF      = 5;
66
67
    /**
     * Normalises the input, initialises the tokenizer and runs it.
     *
     * The state machine starts in the "data" state and keeps invoking the
     * handler named "<state>State" until a handler sets the state to null.
     *
     * @param string $data Markup to tokenise.
     */
    public function __construct($data)
    {
        /* Normalise line endings: CRLF pairs and lone CRs both become LF.
        The previous code stored the result of the second replacement in a
        misspelled variable ($date), so lone CRs were never handled. */
        $data = str_replace(array("\r\n", "\r"), "\n", $data);

        $this->data = $data;
        $this->char = -1;
        $this->EOF  = strlen($data);
        $this->tree = new HTML5TreeConstructer;
        $this->content_model = self::PCDATA;

        $this->state = 'data';

        // Dispatch to the handler for the current state until tokenising ends.
        while($this->state !== null) {
            $this->{$this->state.'State'}();
        }
    }
84
85
    /**
     * Delegates to the tree constructer's save() and returns its result.
     */
    public function save()
    {
        $builder = $this->tree;

        return $builder->save();
    }
89
90
    /**
     * Returns the character under the cursor, or false once the cursor has
     * reached the end of the input.
     */
    private function char()
    {
        if($this->char < $this->EOF) {
            return $this->data[$this->char];
        }

        return false;
    }
96
97
    /**
     * Reads from the input without moving the cursor.
     *
     * @param int $s Offset of the first character to read.
     * @param int $l Number of characters to read; 0 (the default) reads the
     *               single character at $s.
     *
     * @return string|null The requested character(s), or null when the
     *                     requested range is not entirely inside the input.
     */
    private function character($s, $l = 0)
    {
        /* A negative offset would make PHP count from the end of the string,
        which is never what a look-behind near the start of the input wants. */
        if($s < 0) {
            return null;
        }

        if($l === 0) {
            return ($s < $this->EOF) ? $this->data[$s] : null;
        }

        /* A substring may end exactly at the end of the input, hence "<="
        (the previous "<" rejected that case). */
        if($s + $l <= $this->EOF) {
            return substr($this->data, $s, $l);
        }

        return null;
    }
107
108
    /**
     * Returns the longest run of characters, starting at offset $start, that
     * belong to the given character class.
     *
     * @param string $char_class Body of a PCRE character class (the part
     *                           between the square brackets), e.g. 'A-Za-z'
     *                           or '^>'.
     * @param int    $start      Offset in the input where matching begins.
     *
     * @return string The matched run, or an empty string when the character
     *                at $start is not in the class (previously the whole
     *                remaining input was returned in that case, because the
     *                replacement pattern simply failed to match).
     */
    private function characters($char_class, $start)
    {
        // substr() may return false on old PHP versions when $start is past
        // the end of the input; normalise that to an empty subject.
        $rest = (string) substr($this->data, $start);

        if(preg_match('#^['.$char_class.']+#', $rest, $match)) {
            return $match[0];
        }

        return '';
    }
112
113
    /**
     * Data state: consumes the next input character and either emits
     * character data or switches to the state that handles "&" or "<".
     *
     * Fixes over the previous version:
     *  - the "<!--" / "-->" checks now look at the characters that END at the
     *    current one (they used to be off by one / look forward);
     *  - the bulk "anything else" branch always consumes at least one
     *    character, so a "&" or "<" that is plain text in the current content
     *    model can no longer stall the tokenizer;
     *  - in the RCDATA/CDATA content models the bulk branch stops before "-"
     *    and ">" so that the escape-flag logic gets to see them.
     */
    private function dataState()
    {
        // Consume the next input character
        $this->char++;
        $char = $this->char();

        // True for the two content models in which the escape flag matters.
        $raw_text = $this->content_model === self::RCDATA ||
            $this->content_model === self::CDATA;

        if($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
            /* U+0026 AMPERSAND (&)
            When the content model flag is set to one of the PCDATA or RCDATA
            states: switch to the entity data state. Otherwise: treat it as per
            the "anything else" entry below. */
            $this->state = 'entityData';

        } elseif($char === '-') {
            /* If the content model flag is set to either the RCDATA state or
            the CDATA state, and the escape flag is false, and the last four
            characters in the input stream, including this one, are "<!--",
            then set the escape flag to true. */
            if($raw_text && $this->escape === false && $this->char >= 3 &&
            substr($this->data, $this->char - 3, 4) === '<!--') {
                $this->escape = true;
            }

            /* In any case, emit the input character as a character token. Stay
            in the data state. */
            $this->emitToken(array(
                'type' => self::CHARACTR,
                'data' => $char
            ));

        /* U+003C LESS-THAN SIGN (<) */
        } elseif($char === '<' && ($this->content_model === self::PCDATA ||
        ($raw_text && $this->escape === false))) {
            /* PCDATA, or RCDATA/CDATA with the escape flag cleared: switch to
            the tag open state. Otherwise the character is handled by the
            "anything else" entry below. */
            $this->state = 'tagOpen';

        /* U+003E GREATER-THAN SIGN (>) */
        } elseif($char === '>') {
            /* If the content model flag is set to either the RCDATA state or
            the CDATA state, and the escape flag is true, and the last three
            characters in the input stream including this one are "-->", set
            the escape flag to false. */
            if($raw_text && $this->escape === true && $this->char >= 2 &&
            substr($this->data, $this->char - 2, 3) === '-->') {
                $this->escape = false;
            }

            /* In any case, emit the input character as a character token.
            Stay in the data state. */
            $this->emitToken(array(
                'type' => self::CHARACTR,
                'data' => $char
            ));

        } elseif($this->char >= $this->EOF) {
            /* EOF
            Emit an end-of-file token. (">=" rather than "===" so that a
            cursor that overshot the end is still recognised.) */
            $this->EOF();

        } elseif($this->content_model === self::PLAINTEXT) {
            /* When the content model flag is set to the PLAINTEXT state
            THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
            the text and emit it as a character token. */
            $this->emitToken(array(
                'type' => self::CHARACTR,
                'data' => substr($this->data, $this->char)
            ));

            $this->EOF();

        } else {
            /* Anything else
            THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that
            otherwise would also be treated as a character token and emit it
            as a single character token. Stay in the data state.

            The current character is always consumed (it may itself be a "&"
            or "<" that is plain text in this content model); the run then
            extends up to, but not including, the next character that needs
            individual handling. */
            $stops = $raw_text ? '<&->' : '<&';
            $len   = 1 + strcspn($this->data, $stops, $this->char + 1);
            $char  = substr($this->data, $this->char, $len);
            $this->char += $len - 1;

            $this->emitToken(array(
                'type' => self::CHARACTR,
                'data' => $char
            ));

            $this->state = 'data';
        }
    }
213
214
    /**
     * Entity data state: tries to consume an entity after a "&" seen in the
     * data state and emits the result as character data.
     */
    private function entityDataState()
    {
        // Attempt to consume an entity.
        $entity = $this->entity();

        // If nothing is returned, emit a U+0026 AMPERSAND character token.
        // Otherwise, emit the character token that was returned.
        $char = (!$entity) ? '&' : $entity;

        /* Emit a proper character token. The previous code passed the bare
        string to emitToken(), unlike every other emit in this class, which
        passes an array carrying 'type' and 'data'. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

        // Finally, switch to the data state.
        $this->state = 'data';
    }
227
228
    /**
     * Tag open state: entered after the data state has seen a "<".
     *
     * In the RCDATA/CDATA content models only "</" is meaningful. In PCDATA
     * the following character selects between a markup declaration, an end
     * tag, a start tag, a bogus comment, or literal text.
     */
    private function tagOpenState()
    {
        switch($this->content_model) {
            case self::RCDATA:
            case self::CDATA:
                /* Peek at the next character without consuming it. Anything
                other than a solidus means the "<" was plain text: emit it and
                let the data state process the next character. */
                $followed_by_slash = $this->character($this->char + 1) === '/';

                if(!$followed_by_slash) {
                    $less_than = array(
                        'type' => self::CHARACTR,
                        'data' => '<'
                    );
                    $this->emitToken($less_than);

                    $this->state = 'data';
                    break;
                }

                // Consume the solidus and continue in the close tag open state.
                $this->char++;
                $this->state = 'closeTagOpen';
            break;

            case self::PCDATA:
                // Consume the next input character:
                $this->char++;
                $next = $this->char();

                if($next === '!') {
                    /* U+0021 EXCLAMATION MARK (!)
                    Switch to the markup declaration open state. */
                    $this->state = 'markupDeclarationOpen';
                    return;
                }

                if($next === '/') {
                    /* U+002F SOLIDUS (/)
                    Switch to the close tag open state. */
                    $this->state = 'closeTagOpen';
                    return;
                }

                if(preg_match('/^[A-Za-z]$/', $next)) {
                    /* ASCII letter
                    Start a new start tag token whose name is the lowercase
                    form of the letter and continue in the tag name state.
                    The token is not emitted yet. */
                    $this->token = array(
                        'name'  => strtolower($next),
                        'type'  => self::STARTTAG,
                        'attr'  => array()
                    );

                    $this->state = 'tagName';
                    return;
                }

                if($next === '>') {
                    /* U+003E GREATER-THAN SIGN (>)
                    Parse error. Emit "<>" as character data and go back to
                    the data state. */
                    $empty_tag = array(
                        'type' => self::CHARACTR,
                        'data' => '<>'
                    );
                    $this->emitToken($empty_tag);

                    $this->state = 'data';
                    return;
                }

                if($next === '?') {
                    /* U+003F QUESTION MARK (?)
                    Parse error. Switch to the bogus comment state. */
                    $this->state = 'bogusComment';
                    return;
                }

                /* Anything else
                Parse error. Emit "<" as character data and reconsume the
                current input character in the data state. */
                $less_than = array(
                    'type' => self::CHARACTR,
                    'data' => '<'
                );
                $this->emitToken($less_than);

                $this->char--;
                $this->state = 'data';
            break;
        }
    }
313
314
    /**
     * Close tag open state: entered after "</" has been seen.
     *
     * In the RCDATA/CDATA content models the "</" only starts an end tag when
     * it is followed by the name of the current open element and then by
     * whitespace, ">", "/" or the end of the input. Otherwise "</" is emitted
     * as text.
     *
     * Fix over the previous version: reaching the end of the input right
     * after a matching tag name is now accepted, as the rule below requires.
     * The old check treated it as a mismatch, and its explicit EOF test
     * compared the current cursor, which is never at EOF here.
     */
    private function closeTagOpenState()
    {
        $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
        $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;

        $raw_text = $this->content_model === self::RCDATA ||
            $this->content_model === self::CDATA;

        /* Decide whether the candidate end tag must be rejected. Only relevant
        in the RCDATA/CDATA content models. */
        $rejected = false;

        if($raw_text) {
            if(!$the_same) {
                $rejected = true;

            } else {
                // Offset of the character right after the tag name.
                $after = $this->char + 1 + strlen($next_node);

                $rejected = $after < $this->EOF &&
                    !preg_match('/[\t\n\x0b\x0c >\/]/', $this->data[$after]);
            }
        }

        if($rejected) {
            /* The next few characters do not match the tag name of the last
            start tag token emitted (case insensitively), or they do but are
            not immediately followed by one of:
                * U+0009 CHARACTER TABULATION
                * U+000A LINE FEED (LF)
                * U+000B LINE TABULATION
                * U+000C FORM FEED (FF)
                * U+0020 SPACE
                * U+003E GREATER-THAN SIGN (>)
                * U+002F SOLIDUS (/)
                * EOF
            Parse error. Emit "</" as character data and switch to the data
            state to process the next input character. */
            $this->emitToken(array(
                'type' => self::CHARACTR,
                'data' => '</'
            ));

            $this->state = 'data';

        } else {
            /* Otherwise, if the content model flag is set to the PCDATA state,
            or if the next few characters do match that tag name, consume the
            next input character: */
            $this->char++;
            $char = $this->char();

            if(preg_match('/^[A-Za-z]$/', $char)) {
                /* ASCII letter
                Create a new end tag token whose name is the lowercase form of
                the letter, then switch to the tag name state. (The token is
                not emitted yet.) */
                $this->token = array(
                    'name'  => strtolower($char),
                    'type'  => self::ENDTAG
                );

                $this->state = 'tagName';

            } elseif($char === '>') {
                /* U+003E GREATER-THAN SIGN (>)
                Parse error. Switch to the data state. */
                $this->state = 'data';

            } elseif($this->char === $this->EOF) {
                /* EOF
                Parse error. Emit "</" as character data. Reconsume the EOF
                character in the data state. */
                $this->emitToken(array(
                    'type' => self::CHARACTR,
                    'data' => '</'
                ));

                $this->char--;
                $this->state = 'data';

            } else {
                /* Parse error. Switch to the bogus comment state. */
                $this->state = 'bogusComment';
            }
        }
    }
387
388
    /**
     * Tag name state: accumulates the name of the current tag token one
     * character at a time until whitespace, "/", ">" or the end of the input.
     */
    private function tagNameState()
    {
        // Consume the next input character:
        $this->char++;
        $current = $this->character($this->char);

        /* Whitespace (TAB, LF, VT, FF, SPACE) or U+002F SOLIDUS (/):
        the name is complete; continue in the before attribute name state.
        (The solidus is a parse error unless it is a permitted slash.) */
        if(preg_match('/^[\t\n\x0b\x0c ]$/', $current) || $current === '/') {
            $this->state = 'beforeAttributeName';
            return;
        }

        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        if($current === '>') {
            $this->emitToken($this->token);
            $this->state = 'data';
            return;
        }

        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF character
        in the data state. */
        if($this->char === $this->EOF) {
            $this->emitToken($this->token);

            $this->char--;
            $this->state = 'data';
            return;
        }

        /* Anything else
        Append the lowercased character to the current tag token's name and
        stay in the tag name state. */
        $this->token['name'] .= strtolower($current);
        $this->state = 'tagName';
    }
432
433
    /**
     * Before attribute name state: skips whitespace and slashes inside a tag
     * until an attribute name starts, the tag ends, or the input runs out.
     */
    private function beforeAttributeNameState()
    {
        // Consume the next input character:
        $this->char++;
        $current = $this->character($this->char);

        /* Whitespace (TAB, LF, VT, FF, SPACE) or U+002F SOLIDUS (/):
        stay in the before attribute name state. (The solidus is a parse
        error unless it is a permitted slash.) */
        if(preg_match('/^[\t\n\x0b\x0c ]$/', $current) || $current === '/') {
            $this->state = 'beforeAttributeName';
            return;
        }

        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        if($current === '>') {
            $this->emitToken($this->token);
            $this->state = 'data';
            return;
        }

        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF character
        in the data state. */
        if($this->char === $this->EOF) {
            $this->emitToken($this->token);

            $this->char--;
            $this->state = 'data';
            return;
        }

        /* Anything else
        Start a new attribute in the current tag token, named after the
        lowercased current character and with no value yet. Switch to the
        attribute name state. */
        $attribute = array(
            'name'  => strtolower($current),
            'value' => null
        );
        $this->token['attr'][] = $attribute;

        $this->state = 'attributeName';
    }
482
483
    /**
     * Attribute name state: accumulates the name of the attribute most
     * recently added to the current tag token.
     *
     * Fix over the previous version: a solidus always ends the attribute
     * name. The old code only did so when the solidus was NOT followed by
     * ">", so "<br clear/>" produced an attribute called "clear/".
     */
    private function attributeNameState()
    {
        // Consume the next input character:
        $this->char++;
        $char = $this->character($this->char);

        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
            /* U+0009 CHARACTER TABULATION
            U+000A LINE FEED (LF)
            U+000B LINE TABULATION
            U+000C FORM FEED (FF)
            U+0020 SPACE
            Switch to the after attribute name state. */
            $this->state = 'afterAttributeName';

        } elseif($char === '=') {
            /* U+003D EQUALS SIGN (=)
            Switch to the before attribute value state. */
            $this->state = 'beforeAttributeValue';

        } elseif($char === '>') {
            /* U+003E GREATER-THAN SIGN (>)
            Emit the current tag token. Switch to the data state. */
            $this->emitToken($this->token);
            $this->state = 'data';

        } elseif($char === '/') {
            /* U+002F SOLIDUS (/)
            Parse error unless this is a permitted slash. Either way, switch
            to the before attribute name state. */
            $this->state = 'beforeAttributeName';

        } elseif($this->char === $this->EOF) {
            /* EOF
            Parse error. Emit the current tag token. Reconsume the EOF
            character in the data state. */
            $this->emitToken($this->token);

            $this->char--;
            $this->state = 'data';

        } else {
            /* Anything else
            Append the current input character to the current attribute's name.
            Stay in the attribute name state. */
            $last = count($this->token['attr']) - 1;
            $this->token['attr'][$last]['name'] .= strtolower($char);

            $this->state = 'attributeName';
        }
    }
534
535
    /**
     * After attribute name state: entered when whitespace follows an
     * attribute name; decides whether a value, another attribute or the end
     * of the tag comes next.
     *
     * Fix over the previous version: a solidus always leads to the before
     * attribute name state. The old code skipped that branch when the solidus
     * was followed by ">", so "<br clear />" gained a bogus attribute
     * named "/".
     */
    private function afterAttributeNameState()
    {
        // Consume the next input character:
        $this->char++;
        $char = $this->character($this->char);

        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
            /* U+0009 CHARACTER TABULATION
            U+000A LINE FEED (LF)
            U+000B LINE TABULATION
            U+000C FORM FEED (FF)
            U+0020 SPACE
            Stay in the after attribute name state. */
            $this->state = 'afterAttributeName';

        } elseif($char === '=') {
            /* U+003D EQUALS SIGN (=)
            Switch to the before attribute value state. */
            $this->state = 'beforeAttributeValue';

        } elseif($char === '>') {
            /* U+003E GREATER-THAN SIGN (>)
            Emit the current tag token. Switch to the data state. */
            $this->emitToken($this->token);
            $this->state = 'data';

        } elseif($char === '/') {
            /* U+002F SOLIDUS (/)
            Parse error unless this is a permitted slash. Either way, switch
            to the before attribute name state. */
            $this->state = 'beforeAttributeName';

        } elseif($this->char === $this->EOF) {
            /* EOF
            Parse error. Emit the current tag token. Reconsume the EOF
            character in the data state. */
            $this->emitToken($this->token);

            $this->char--;
            $this->state = 'data';

        } else {
            /* Anything else
            Start a new attribute in the current tag token. Set that attribute's
            name to the current input character, and its value to the empty string.
            Switch to the attribute name state. */
            $this->token['attr'][] = array(
                'name'  => strtolower($char),
                'value' => null
            );

            $this->state = 'attributeName';
        }
    }
589
590
    /**
     * Before attribute value state: entered after "=" in a tag; skips
     * whitespace and selects the quoted or unquoted value state.
     *
     * Fix over the previous version: the end of the input is handled
     * explicitly. Previously EOF fell through to "anything else" and on to
     * the unquoted value state, which has no exit at EOF, so input ending in
     * e.g. "<a href=" never finished tokenising.
     */
    private function beforeAttributeValueState()
    {
        // Consume the next input character:
        $this->char++;
        $char = $this->character($this->char);

        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
            /* U+0009 CHARACTER TABULATION
            U+000A LINE FEED (LF)
            U+000B LINE TABULATION
            U+000C FORM FEED (FF)
            U+0020 SPACE
            Stay in the before attribute value state. */
            $this->state = 'beforeAttributeValue';

        } elseif($char === '"') {
            /* U+0022 QUOTATION MARK (")
            Switch to the attribute value (double-quoted) state. */
            $this->state = 'attributeValueDoubleQuoted';

        } elseif($char === '&') {
            /* U+0026 AMPERSAND (&)
            Switch to the attribute value (unquoted) state and reconsume
            this input character. */
            $this->char--;
            $this->state = 'attributeValueUnquoted';

        } elseif($char === '\'') {
            /* U+0027 APOSTROPHE (')
            Switch to the attribute value (single-quoted) state. */
            $this->state = 'attributeValueSingleQuoted';

        } elseif($char === '>') {
            /* U+003E GREATER-THAN SIGN (>)
            Emit the current tag token. Switch to the data state. */
            $this->emitToken($this->token);
            $this->state = 'data';

        } elseif($this->char === $this->EOF) {
            /* EOF
            Parse error. Emit the current tag token. Reconsume the EOF
            character in the data state. */
            $this->emitToken($this->token);

            $this->char--;
            $this->state = 'data';

        } else {
            /* Anything else
            Append the current input character to the current attribute's value.
            Switch to the attribute value (unquoted) state. */
            $last = count($this->token['attr']) - 1;
            $this->token['attr'][$last]['value'] .= $char;

            $this->state = 'attributeValueUnquoted';
        }
    }
638
639
    /**
     * Attribute value (double-quoted) state: collects characters into the
     * current attribute's value until the closing quotation mark.
     */
    private function attributeValueDoubleQuotedState()
    {
        // Consume the next input character:
        $this->char++;
        $current = $this->character($this->char);

        /* U+0022 QUOTATION MARK (")
        The value is complete. Switch to the before attribute name state. */
        if($current === '"') {
            $this->state = 'beforeAttributeName';
            return;
        }

        /* U+0026 AMPERSAND (&)
        Hand over to the entity in attribute value handling. */
        if($current === '&') {
            $this->entityInAttributeValueState('double');
            return;
        }

        /* EOF
        Parse error. Emit the current tag token. Reconsume the character in
        the data state. */
        if($this->char === $this->EOF) {
            $this->emitToken($this->token);

            $this->char--;
            $this->state = 'data';
            return;
        }

        /* Anything else
        Append the character to the value of the most recently added
        attribute. Stay in the attribute value (double-quoted) state. */
        $index = count($this->token['attr']) - 1;
        $this->token['attr'][$index]['value'] .= $current;

        $this->state = 'attributeValueDoubleQuoted';
    }
674
675
    /**
     * Attribute value (single-quoted) state: collects characters into the
     * current attribute's value until the closing apostrophe.
     */
    private function attributeValueSingleQuotedState()
    {
        // Consume the next input character:
        $this->char++;
        $current = $this->character($this->char);

        /* U+0027 APOSTROPHE (')
        The value is complete. Switch to the before attribute name state. */
        if($current === '\'') {
            $this->state = 'beforeAttributeName';
            return;
        }

        /* U+0026 AMPERSAND (&)
        Hand over to the entity in attribute value handling. */
        if($current === '&') {
            $this->entityInAttributeValueState('single');
            return;
        }

        /* EOF
        Parse error. Emit the current tag token. Reconsume the character in
        the data state. */
        if($this->char === $this->EOF) {
            $this->emitToken($this->token);

            $this->char--;
            $this->state = 'data';
            return;
        }

        /* Anything else
        Append the character to the value of the most recently added
        attribute. Stay in the attribute value (single-quoted) state. */
        $index = count($this->token['attr']) - 1;
        $this->token['attr'][$index]['value'] .= $current;

        $this->state = 'attributeValueSingleQuoted';
    }
710
711
    /**
     * Attribute value (unquoted) state: collects characters into the current
     * attribute's value until whitespace, ">" or the end of the input.
     *
     * Fix over the previous version: the end of the input now terminates the
     * tag. Previously EOF fell into "anything else", which stays in this
     * state forever, so input such as "<a href=foo" never finished
     * tokenising.
     */
    private function attributeValueUnquotedState()
    {
        // Consume the next input character:
        $this->char++;
        $char = $this->character($this->char);

        if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
            /* U+0009 CHARACTER TABULATION
            U+000A LINE FEED (LF)
            U+000B LINE TABULATION
            U+000C FORM FEED (FF)
            U+0020 SPACE
            Switch to the before attribute name state. */
            $this->state = 'beforeAttributeName';

        } elseif($char === '&') {
            /* U+0026 AMPERSAND (&)
            Switch to the entity in attribute value state. */
            $this->entityInAttributeValueState('non');

        } elseif($char === '>') {
            /* U+003E GREATER-THAN SIGN (>)
            Emit the current tag token. Switch to the data state. */
            $this->emitToken($this->token);
            $this->state = 'data';

        } elseif($this->char >= $this->EOF) {
            /* EOF
            Parse error. Emit the current tag token. Reconsume the EOF
            character in the data state. The cursor is set explicitly (rather
            than decremented) because this state can be entered with the
            cursor already at the end of the input, in which case the
            increment above moved it one position past EOF. */
            $this->emitToken($this->token);

            $this->char = $this->EOF - 1;
            $this->state = 'data';

        } else {
            /* Anything else
            Append the current input character to the current attribute's value.
            Stay in the attribute value (unquoted) state. */
            $last = count($this->token['attr']) - 1;
            $this->token['attr'][$last]['value'] .= $char;

            $this->state = 'attributeValueUnquoted';
        }
    }
747
748
    /**
     * Entity in attribute value state of the tokeniser.
     *
     * Attempts to consume an entity and appends the result to the value of
     * the last attribute on the current tag token. The tokeniser state is not
     * touched, so the attribute value state that called this method stays
     * active.
     */
    private function entityInAttributeValueState()
    {
        // Attempt to consume an entity.
        $entity = $this->entity();

        // If nothing is returned, append a U+0026 AMPERSAND character to the
        // current attribute's value. Otherwise, append the returned character.
        // entity() signals failure with false; compare strictly so that a
        // decoded "0" is not mistaken for a failed match.
        $char = ($entity === false)
            ? '&'
            : $entity;

        // The text belongs to the attribute being built, not to the token
        // stream, so append it instead of emitting it as a token.
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;
    }
762
763
    /**
     * Bogus comment state of the tokeniser.
     *
     * Collects the characters returned by characters() for the class "^>"
     * starting at the current position, emits them as a single comment token,
     * advances the position by the length of that text and returns to the
     * data state.
     */
    private function bogusCommentState()
    {
        /* The comment's data is everything from the character that caused the
        switch into this state up to (but not including) the first U+003E
        GREATER-THAN SIGN (>), or up to the end of the file. */
        $commentText = $this->characters('^>', $this->char);

        $comment = array(
            'data' => $commentText,
            'type' => self::COMMENT
        );
        $this->emitToken($comment);

        /* Skip over what was just emitted and go back to the data state. */
        $this->char = $this->char + strlen($commentText);
        $this->state = 'data';

        /* When the end of the file was reached, step back one position so
        that the EOF character is reconsumed. */
        if($this->EOF === $this->char) {
            $this->char = $this->EOF - 1;
        }
    }
789
790
    /**
     * Markup declaration open state of the tokeniser.
     *
     * Looks ahead from the position after the current one and switches to
     * the comment, DOCTYPE or bogus comment state accordingly.
     */
    private function markupDeclarationOpenState()
    {
        $lookahead = $this->char + 1;

        /* Two U+002D HYPHEN-MINUS (-) characters: consume them, start a
        comment token without data and switch to the comment state. */
        if($this->character($lookahead, 2) === '--') {
            $this->char += 2;
            $this->state = 'comment';
            $this->token = array(
                'data' => null,
                'type' => self::COMMENT
            );
            return;
        }

        /* A case-insensitive match for the seven characters of "DOCTYPE":
        consume them and switch to the DOCTYPE state. */
        if(strtolower($this->character($lookahead, 7)) === 'doctype') {
            $this->char += 7;
            $this->state = 'doctype';
            return;
        }

        /* Anything else is a parse error. Switch to the bogus comment state;
        the next character consumed, if any, is the first character of the
        comment. */
        $this->char++;
        $this->state = 'bogusComment';
    }
818
819
    /**
     * Comment state of the tokeniser.
     *
     * Consumes one character: a hyphen moves to the comment dash state, EOF
     * emits the comment token and returns to the data state, anything else is
     * appended to the comment token's data.
     */
    private function commentState()
    {
        /* Consume the next input character: */
        $this->char++;
        $current = $this->char();

        /* U+002D HYPHEN-MINUS (-): switch to the comment dash state. */
        if($current === '-') {
            $this->state = 'commentDash';
            return;
        }

        /* EOF: parse error. Emit the comment token and reconsume the EOF
        character in the data state. */
        if($this->char === $this->EOF) {
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';
            return;
        }

        /* Anything else: append the character to the comment token's data
        and stay in the comment state. */
        $this->token['data'] .= $current;
    }
845
846
    /**
     * Comment dash state of the tokeniser (one hyphen seen inside a comment).
     *
     * Consumes one character: a second hyphen moves to the comment end state,
     * EOF emits the comment token and returns to the data state, anything
     * else is appended (after a hyphen) and the comment state resumes.
     */
    private function commentDashState()
    {
        /* Consume the next input character: */
        $this->char++;
        $current = $this->char();

        /* U+002D HYPHEN-MINUS (-): switch to the comment end state. */
        if($current === '-') {
            $this->state = 'commentEnd';
            return;
        }

        /* EOF: parse error. Emit the comment token and reconsume the EOF
        character in the data state. */
        if($this->char === $this->EOF) {
            $this->emitToken($this->token);
            $this->char--;
            $this->state = 'data';
            return;
        }

        /* Anything else: append a U+002D HYPHEN-MINUS (-) character and the
        input character to the comment token's data, then switch back to the
        comment state. */
        $this->token['data'] .= '-'.$current;
        $this->state = 'comment';
    }
873
874
    /**
     * Comment end state of the tokeniser (two hyphens seen inside a comment).
     *
     * Consumes one character: ">" or EOF emits the comment token and returns
     * to the data state, a further hyphen is appended to the data, anything
     * else is appended after two hyphens and the comment state resumes.
     */
    private function commentEndState()
    {
        /* Consume the next input character: */
        $this->char++;
        $current = $this->char();

        $closes = ($current === '>');

        /* ">" or (for any character other than "-") EOF: emit the comment
        token and switch to the data state. */
        if($closes || ($current !== '-' && $this->char === $this->EOF)) {
            $this->emitToken($this->token);

            if(!$closes) {
                /* EOF only: step back so the EOF character is reconsumed. */
                $this->char--;
            }

            $this->state = 'data';
            return;
        }

        /* A further hyphen: append it and stay in the comment end state. */
        if($current === '-') {
            $this->token['data'] .= '-';
            return;
        }

        /* Anything else: append two hyphens and the input character to the
        comment token's data, then switch to the comment state. */
        $this->token['data'] .= '--'.$current;
        $this->state = 'comment';
    }
897
898
    /**
     * DOCTYPE state of the tokeniser.
     *
     * Consumes one character. A white space character is swallowed; any other
     * character is handed back for reconsumption. Either way the tokeniser
     * moves on to the before DOCTYPE name state.
     */
    private function doctypeState()
    {
        /* Consume the next input character: */
        $this->char++;

        if(!preg_match('/^[\t\n\x0b\x0c ]$/', $this->char())) {
            /* Not white space: step back so that the before DOCTYPE name
            state sees this character again. */
            $this->char--;
        }

        $this->state = 'beforeDoctypeName';
    }
912
913
    /**
     * Before DOCTYPE name state of the tokeniser.
     *
     * Skips white space, emits an erroneous nameless DOCTYPE token on ">" or
     * EOF, and otherwise starts a new DOCTYPE token whose name begins with
     * the consumed character (lowercase ASCII letters are uppercased).
     */
    private function beforeDoctypeNameState()
    {
        /* Consume the next input character: */
        $this->char++;
        $current = $this->char();

        /* White space: stay in the before DOCTYPE name state. */
        if(preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
            return;
        }

        $isLowerLetter = preg_match('/^[a-z]$/', $current);
        $closes = ($current === '>');

        /* ">" or EOF: emit a DOCTYPE token without a name, marked as being
        in error, and switch to the data state. */
        if(!$isLowerLetter && ($closes || $this->char === $this->EOF)) {
            $this->emitToken(array(
                'name' => null,
                'type' => self::DOCTYPE,
                'error' => true
            ));

            if(!$closes) {
                /* EOF only: step back so the EOF character is reconsumed. */
                $this->char--;
            }

            $this->state = 'data';
            return;
        }

        /* A lowercase letter starts the name in uppercase; any other
        character starts it as is. The token is marked as being in error
        until the name state decides otherwise. */
        $this->token = array(
            'name' => $isLowerLetter ? strtoupper($current) : $current,
            'type' => self::DOCTYPE,
            'error' => true
        );

        $this->state = 'doctypeName';
    }
960
961
    /**
     * DOCTYPE name state of the tokeniser.
     *
     * Consumes one character and either ends the name (white space), emits
     * the DOCTYPE token (">" or EOF) or extends the name. Afterwards the
     * token's error flag is recomputed: it is false only while the name is
     * exactly "HTML".
     */
    private function doctypeNameState()
    {
        /* Consume the next input character: */
        $this->char++;
        $current = $this->char();

        if(preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
            /* White space ends the name. */
            $this->state = 'AfterDoctypeName';

        } elseif($current === '>') {
            /* Emit the current DOCTYPE token; switch to the data state. */
            $this->emitToken($this->token);
            $this->state = 'data';

        } else {
            $isLowerLetter = preg_match('/^[a-z]$/', $current);

            if(!$isLowerLetter && $this->char === $this->EOF) {
                /* EOF: emit the token and reconsume the EOF character in
                the data state. */
                $this->emitToken($this->token);
                $this->char--;
                $this->state = 'data';

            } else {
                /* Lowercase letters are appended in uppercase, everything
                else unchanged. Stay in the DOCTYPE name state. */
                $this->token['name'] .= $isLowerLetter
                    ? strtoupper($current)
                    : $current;
            }
        }

        /* The token is correct only when its name is "HTML". */
        $this->token['error'] = ($this->token['name'] !== 'HTML');
    }
990
991
    /**
     * After DOCTYPE name state of the tokeniser.
     *
     * Skips white space, emits the DOCTYPE token on ">" or EOF, and treats
     * any other character as an error that leads into the bogus DOCTYPE
     * state.
     */
    private function afterDoctypeNameState()
    {
        /* Consume the next input character: */
        $this->char++;
        $current = $this->char();

        /* White space: stay in the after DOCTYPE name state. */
        if(preg_match('/^[\t\n\x0b\x0c ]$/', $current)) {
            return;
        }

        $closes = ($current === '>');

        /* ">" or EOF: emit the current DOCTYPE token and switch to the data
        state. */
        if($closes || $this->char === $this->EOF) {
            $this->emitToken($this->token);

            if(!$closes) {
                /* EOF only: step back so the EOF character is reconsumed. */
                $this->char--;
            }

            $this->state = 'data';
            return;
        }

        /* Anything else: mark the token as being in error and switch to the
        bogus DOCTYPE state. */
        $this->token['error'] = true;
        $this->state = 'bogusDoctype';
    }
1014
1015
    /**
     * Bogus DOCTYPE state of the tokeniser.
     *
     * Consumes one character. ">" or EOF emits the current DOCTYPE token and
     * returns to the data state; every other character is skipped.
     */
    private function bogusDoctypeState()
    {
        /* Consume the next input character: */
        $this->char++;
        $closes = ($this->char() === '>');

        if(!$closes && $this->char !== $this->EOF) {
            /* Anything else: stay in the bogus DOCTYPE state. */
            return;
        }

        $this->emitToken($this->token);

        if(!$closes) {
            /* EOF only: step back so the EOF character is reconsumed. */
            $this->char--;
        }

        $this->state = 'data';
    }
1034
1035
    /**
     * Attempts to consume an entity (character reference).
     *
     * On entry $this->char is the position of the U+0026 AMPERSAND. On
     * success $this->char is left on the last character that belongs to the
     * reference and the decoded text is returned. On failure no characters
     * are consumed ($this->char is restored) and false is returned.
     *
     * @return string|false decoded character(s), or false if nothing matched
     */
    private function entity()
    {
        $start = $this->char;

        // Reference text between "&" and ";" (for example "#x41" or "amp"),
        // or null as long as nothing has matched.
        $reference = null;

        // The behaviour depends on the identity of the next character (the
        // one immediately after the U+0026 AMPERSAND character):
        if($this->character($start + 1) === '#') {
            // U+0023 NUMBER SIGN (#)
            // The behaviour further depends on the character AFTER the number
            // sign, i.e. two positions past the ampersand: "x"/"X" selects
            // hexadecimal digits (0-9, A-F, a-f), anything else decimal
            // digits (0-9).
            $marker = $this->character($start + 2);
            $hex = ($marker === 'x' || $marker === 'X');

            $char_class = $hex ? '0-9A-Fa-f' : '0-9';
            $first = $start + ($hex ? 3 : 2);

            // Consume as many characters as match the range given above.
            // The result is re-validated so that only a leading run of
            // digits is ever accepted.
            $run = (string) $this->characters($char_class, $first);

            if(preg_match('/^['.$char_class.']+/', $run, $match)) {
                $digits = $match[0];

                // Leave the position on the last digit ...
                $this->char = $first + strlen($digits) - 1;

                // ... and consume the terminating semicolon when present.
                if($this->character($this->char + 1) === ';') {
                    $this->char++;
                }

                $reference = ($hex ? '#x' : '#').$digits;
            }

        } else {
            // Anything else
            // Consume the maximum number of characters possible, with the
            // consumed characters case-sensitively matching one of the
            // identifiers in the first column of the entities table.
            $run = (string) $this->characters('0-9A-Za-z;', $start + 1);

            // Only the leading run of identifier characters can match, and
            // never more of it than the longest identifier in the table.
            $len = strspn($run, '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz;');
            $longest = $this->entities
                ? max(array_map('strlen', $this->entities))
                : 0;
            $len = min($len, $longest);

            // Try the longest candidate first so that the maximum number of
            // characters is consumed.
            for($c = $len; $c >= 1; $c--) {
                $id = substr($run, 0, $c);

                if(in_array($id, $this->entities, true)) {
                    // Leave the position on the last consumed character.
                    $this->char = $start + $c;

                    // Table entries may carry their own trailing semicolon;
                    // strip it so it is not doubled when decoding below.
                    $reference = rtrim($id, ';');
                    break;
                }
            }
        }

        if($reference === null) {
            // If no match can be made, then this is a parse error. No
            // characters are consumed, and nothing is returned.
            $this->char = $start;
            return false;
        }

        // Return the character corresponding to the entity name (as given by
        // the second column of the entities table) or to the number.
        return html_entity_decode('&'.$reference.';', ENT_QUOTES, 'UTF-8');
    }
1120
1121
    /**
     * Hands a token to the tree constructer and updates the content model.
     *
     * An integer returned by the tree constructer becomes the new content
     * model; otherwise an end tag token resets the content model to PCDATA.
     *
     * @param mixed $token token to emit; its 'type' entry is read when the
     *                     tree constructer does not return an integer
     */
    private function emitToken($token)
    {
        $requested = $this->tree->emitToken($token);

        if(is_int($requested)) {
            $this->content_model = $requested;
            return;
        }

        if($token['type'] === self::ENDTAG) {
            $this->content_model = self::PCDATA;
        }
    }
1132
1133
    /**
     * Clears the tokeniser state and passes an end-of-file token straight to
     * the tree constructer.
     */
    private function EOF()
    {
        $this->state = null;

        $eofToken = array(
            'type' => self::EOF
        );
        $this->tree->emitToken($eofToken);
    }
1140
}
1141
1142
class HTML5TreeConstructer
1143
{
1144
    public $stack = array();
1145
1146
    private $phase;
1147
    private $mode;
1148
    private $dom;
1149
    private $foster_parent = null;
1150
    private $a_formatting  = array();
1151
1152
    private $head_pointer = null;
1153
    private $form_pointer = null;
1154
1155
    private $scoping = array('button','caption','html','marquee','object','table','td','th');
1156
    private $formatting = array('a','b','big','em','font','i','nobr','s','small','strike','strong','tt','u');
1157
    private $special = array('address','area','base','basefont','bgsound',
1158
    'blockquote','body','br','center','col','colgroup','dd','dir','div','dl',
1159
    'dt','embed','fieldset','form','frame','frameset','h1','h2','h3','h4','h5',
1160
    'h6','head','hr','iframe','image','img','input','isindex','li','link',
1161
    'listing','menu','meta','noembed','noframes','noscript','ol','optgroup',
1162
    'option','p','param','plaintext','pre','script','select','spacer','style',
1163
    'tbody','textarea','tfoot','thead','title','tr','ul','wbr');
1164
1165
    // The different phases.
1166
    const INIT_PHASE = 0;
1167
    const ROOT_PHASE = 1;
1168
    const MAIN_PHASE = 2;
1169
    const END_PHASE  = 3;
1170
1171
    // The different insertion modes for the main phase.
1172
    const BEFOR_HEAD = 0;
1173
    const IN_HEAD    = 1;
1174
    const AFTER_HEAD = 2;
1175
    const IN_BODY    = 3;
1176
    const IN_TABLE   = 4;
1177
    const IN_CAPTION = 5;
1178
    const IN_CGROUP  = 6;
1179
    const IN_TBODY   = 7;
1180
    const IN_ROW     = 8;
1181
    const IN_CELL    = 9;
1182
    const IN_SELECT  = 10;
1183
    const AFTER_BODY = 11;
1184
    const IN_FRAME   = 12;
1185
    const AFTR_FRAME = 13;
1186
1187
    // The different types of elements.
1188
    const SPECIAL    = 0;
1189
    const SCOPING    = 1;
1190
    const FORMATTING = 2;
1191
    const PHRASING   = 3;
1192
1193
    const MARKER     = 0;
1194
1195
    /**
     * Starts in the initial phase with the "before head" insertion mode and
     * prepares an empty UTF-8 DOMDocument to build the tree in.
     */
    public function __construct()
    {
        $document = new DOMDocument();
        $document->encoding = 'UTF-8';
        $document->preserveWhiteSpace = true;
        $document->substituteEntities = true;
        $document->strictErrorChecking = false;

        $this->phase = self::INIT_PHASE;
        $this->mode = self::BEFOR_HEAD;
        $this->dom = $document;
    }
1206
1207
    // Process tag tokens
1208
    /**
     * Routes a token to the handler of the current tree construction phase.
     *
     * @param mixed $token token emitted by the tokeniser
     * @return mixed whatever the phase handler returns; null when the phase
     *               matches none of the known phases
     */
    public function emitToken($token)
    {
        switch($this->phase) {
            case self::INIT_PHASE:
                return $this->initPhase($token);

            case self::ROOT_PHASE:
                return $this->rootElementPhase($token);

            case self::MAIN_PHASE:
                return $this->mainPhase($token);

            case self::END_PHASE:
                return $this->trailingEndPhase($token);
        }

        return null;
    }
1217
1218
    /**
     * Initial phase of tree construction.
     *
     * A correct DOCTYPE token adds a DocumentType node to the document and
     * moves on to the root element phase. White space character tokens are
     * appended to the document. Every other token switches to the root
     * element phase and is reprocessed there.
     *
     * @param array $token token emitted by the tokeniser; reads 'type' and,
     *                     when present, 'error', 'data' and 'name'
     * @return mixed result of rootElementPhase() when the token is
     *               reprocessed there, otherwise null
     */
    private function initPhase($token)
    {
        /* Initially, the tree construction stage must handle each token
        emitted from the tokenisation stage as follows: */

        /* A DOCTYPE token that is marked as being in error
        A comment token
        A start tag token
        An end tag token
        A character token that is not one of one of U+0009 CHARACTER TABULATION,
            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
            or U+0020 SPACE
        An end-of-file token */
        if((isset($token['error']) && $token['error']) ||
        $token['type'] === HTML5::COMMENT ||
        $token['type'] === HTML5::STARTTAG ||
        $token['type'] === HTML5::ENDTAG ||
        $token['type'] === HTML5::EOF ||
        ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
        !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))) {
            /* This specification does not define how to handle this case. In
            particular, user agents may ignore the entirety of this specification
            altogether for such documents, and instead invoke special parse modes
            with a greater emphasis on backwards compatibility. */

            $this->phase = self::ROOT_PHASE;
            return $this->rootElementPhase($token);

        /* A DOCTYPE token marked as being correct */
        } elseif(isset($token['error']) && !$token['error']) {
            /* Append a DocumentType node to the Document node, with the name
            attribute set to the name given in the DOCTYPE token (which will be
            "HTML"), and the other attributes specific to DocumentType objects
            left at their defaults. The node is created through
            DOMImplementation and actually appended to the document; a bare
            "new DOMDocumentType" is neither initialised nor attached. */
            $implementation = new DOMImplementation();
            $doctype = $implementation->createDocumentType(
                isset($token['name']) ? $token['name'] : 'HTML'
            );

            if($doctype) {
                $this->dom->appendChild($doctype);
            }

            /* Then, switch to the root element phase of the tree construction
            stage. */
            $this->phase = self::ROOT_PHASE;

        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        } elseif(isset($token['data']) && preg_match('/^[\t\n\x0b\x0c ]+$/',
        $token['data'])) {
            /* Append that character to the Document node. */
            $text = $this->dom->createTextNode($token['data']);
            $this->dom->appendChild($text);
        }
    }
1268
1269
    /**
     * Root element phase of tree construction.
     *
     * DOCTYPE tokens are ignored, comments and white space are appended to
     * the document, and any other character, start tag, end tag or
     * end-of-file token creates the html root element, switches to the main
     * phase and is reprocessed there.
     *
     * @param array $token token emitted by the tokeniser; reads 'type' and,
     *                     for comment and character tokens, 'data'
     * @return mixed result of mainPhase() when the token is reprocessed
     *               there, otherwise null
     */
    private function rootElementPhase($token)
    {
        $type = $token['type'];

        /* A DOCTYPE token: parse error. Ignore the token. */
        if($type === HTML5::DOCTYPE) {
            return;
        }

        /* A comment token: append a Comment node carrying the token's data
        to the Document object. */
        if($type === HTML5::COMMENT) {
            $this->dom->appendChild($this->dom->createComment($token['data']));
            return;
        }

        if($type === HTML5::CHARACTR) {
            /* A character token made up only of U+0009 CHARACTER TABULATION,
            U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED
            (FF) or U+0020 SPACE: append it to the Document node. */
            if(preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
                $this->dom->appendChild(
                    $this->dom->createTextNode($token['data'])
                );
                return;
            }

            /* Any other character token falls through to the root element
            creation below. */

        } elseif($type !== HTML5::STARTTAG &&
        $type !== HTML5::ENDTAG &&
        $type !== HTML5::EOF) {
            /* No rule applies to any other kind of token. */
            return;
        }

        /* Create an element with the tag name html and append it to the
        Document object. Put it on the stack of open elements, switch to the
        main phase and reprocess the current token. */
        $root = $this->dom->createElement('html');
        $this->dom->appendChild($root);
        $this->stack[] = $root;

        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);
    }
1316
1317
    /**
     * Main phase of tree construction.
     *
     * Handles the tokens that are treated the same in every insertion mode
     * (DOCTYPE, an "html" start tag, end-of-file) and dispatches everything
     * else to the handler of the current insertion mode.
     *
     * @param array $token token emitted by the tokeniser; reads 'type' and,
     *                     for start tags, 'name' and 'attr'
     * @return mixed result of the insertion mode handler, otherwise null
     */
    private function mainPhase($token)
    {
        $type = $token['type'];

        /* A DOCTYPE token: parse error. Ignore the token. */
        if($type === HTML5::DOCTYPE) {
            return;
        }

        /* A start tag token with the tag name "html" */
        if($type === HTML5::STARTTAG && $token['name'] === 'html') {
            /* If this start tag token was not the first start tag token, then
            it is a parse error. */

            /* Copy every attribute of the token that the first element on
            the stack of open elements does not have yet onto that element. */
            $root = $this->stack[0];

            foreach($token['attr'] as $attribute) {
                if($root->hasAttribute($attribute['name'])) {
                    continue;
                }

                $root->setAttribute($attribute['name'], $attribute['value']);
            }

            return;
        }

        /* An end-of-file token: generate implied end tags. */
        if($type === HTML5::EOF) {
            $this->generateImpliedEndTags();
            return;
        }

        /* Anything else depends on the insertion mode: */
        switch($this->mode) {
            case self::BEFOR_HEAD:
                return $this->beforeHead($token);

            case self::IN_HEAD:
                return $this->inHead($token);

            case self::AFTER_HEAD:
                return $this->afterHead($token);

            case self::IN_BODY:
                return $this->inBody($token);

            case self::IN_TABLE:
                return $this->inTable($token);

            case self::IN_CAPTION:
                return $this->inCaption($token);

            case self::IN_CGROUP:
                return $this->inColumnGroup($token);

            case self::IN_TBODY:
                return $this->inTableBody($token);

            case self::IN_ROW:
                return $this->inRow($token);

            case self::IN_CELL:
                return $this->inCell($token);

            case self::IN_SELECT:
                return $this->inSelect($token);

            case self::AFTER_BODY:
                return $this->afterBody($token);

            case self::IN_FRAME:
                return $this->inFrameset($token);

            case self::AFTR_FRAME:
                return $this->afterFrameset($token);

            // NOTE(review): END_PHASE is a phase constant with the same
            // value (3) as IN_BODY, so this case is shadowed by the IN_BODY
            // case above and is never selected.
            case self::END_PHASE:
                return $this->trailingEndPhase($token);
        }
    }
1367
1368
    /**
     * "Before head" insertion mode.
     *
     * Inserts white space and comments, opens the head element for a "head"
     * start tag, and for other start tags, an "html" end tag or other
     * character tokens acts as if a "head" start tag had been seen before
     * handing the token to the "in head" handler. Remaining end tags are
     * ignored.
     *
     * @param array $token token to process; reads 'type' and, depending on
     *                     the type, 'name' or 'data'
     * @return mixed result of inHead() when the token is reprocessed there,
     *               otherwise null
     */
    private function beforeHead($token)
    {
        $type = $token['type'];

        /* A character token made up only of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF)
        or U+0020 SPACE: append it to the current node. */
        if($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            $this->insertText($token['data']);
            return;
        }

        /* A comment token: append a Comment node with the token's data to
        the current node. */
        if($type === HTML5::COMMENT) {
            $this->insertComment($token['data']);
            return;
        }

        /* A start tag token with the tag name "head": create and insert the
        element, remember it as the head element pointer and change the
        insertion mode to "in head". */
        if($type === HTML5::STARTTAG && $token['name'] === 'head') {
            $this->head_pointer = $this->insertElement($token);
            $this->mode = self::IN_HEAD;
            return;
        }

        /* Any other start tag token, an end tag with the tag name "html", or
        a character token that is not a white space character. */
        $impliesHead = $type === HTML5::STARTTAG
            || ($type === HTML5::ENDTAG && $token['name'] === 'html')
            || ($type === HTML5::CHARACTR
                && !preg_match('/^[\t\n\x0b\x0c ]$/', $token['data']));

        if($impliesHead) {
            /* Act as if a start tag token with the tag name "head" and no
            attributes had been seen, then reprocess the current token. */
            $this->beforeHead(array(
                'name' => 'head',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            ));

            return $this->inHead($token);
        }

        /* Any other end tag: parse error. Ignore the token. */
    }
1422
1423
    /**
     * "In head" insertion mode.
     *
     * Inserts text and comments, creates the title/style/script/base/link/
     * meta elements inside the head, closes the head on its end tag and
     * passes everything that does not belong into the head on to the "after
     * head" handler.
     *
     * The head element pointer may be null (innerHTML case); every branch
     * that uses it falls back to the current node or skips the comparison
     * instead of calling a method on null.
     *
     * @param array $token token to process; reads 'type' and, depending on
     *                     the type, 'name', 'data' or 'attr'
     * @return mixed a content model flag (HTML5::PCDATA, HTML5::RCDATA or
     *               HTML5::CDATA) when the tokeniser has to switch, the
     *               result of afterHead() when the token is reprocessed
     *               there, otherwise null
     */
    private function inHead($token)
    {
        /* Handle the token as follows: */

        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE.

        THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
        or script element, append the character to the current node regardless
        of its content. */
        if($token['type'] === HTML5::CHARACTR && (
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) ||
        in_array(end($this->stack)->nodeName,
        array('title', 'style', 'script'), true))) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data attribute
            set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* An end tag with the tag name "title", "style" or "script" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('title', 'style', 'script'), true)) {
            /* Pop the current node off the stack of open elements and switch
            the tokeniser's content model flag back to the PCDATA state. */
            array_pop($this->stack);
            return HTML5::PCDATA;

        /* A start tag with the tag name "title" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
            /* Create an element for the token and append the new element to the
            node pointed to by the head element pointer, or, if that is null
            (innerHTML case), to the current node. */
            if($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);

            } else {
                $this->insertElement($token);
            }

            /* Switch the tokeniser's content model flag to the RCDATA state. */
            return HTML5::RCDATA;

        /* A start tag with the tag name "style" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
            /* Create an element for the token and append the new element to the
            node pointed to by the head element pointer, or, if that is null
            (innerHTML case), to the current node. */
            if($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);

            } else {
                $this->insertElement($token);
            }

            /* Switch the tokeniser's content model flag to the CDATA state. */
            return HTML5::CDATA;

        /* A start tag with the tag name "script" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
            /* Create an element for the token and append it to the node
            pointed to by the head element pointer, or, if that is null
            (innerHTML case), to the current node - the same fallback the
            "title" and "style" branches use. */
            if($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);

            } else {
                $this->insertElement($token);
            }

            /* Switch the tokeniser's content model flag to the CDATA state. */
            return HTML5::CDATA;

        /* A start tag with the tag name "base", "link", or "meta" */
        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('base', 'link', 'meta'), true)) {
            /* Create an element for the token and append the new element to the
            node pointed to by the head element pointer, or, if that is null
            (innerHTML case), to the current node. */
            if($this->head_pointer !== null) {
                $element = $this->insertElement($token, false);
                $this->head_pointer->appendChild($element);
                array_pop($this->stack);

            } else {
                $this->insertElement($token);
            }

        /* An end tag with the tag name "head" */
        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
            /* If the current node is a head element, pop the current node off
            the stack of open elements. Otherwise, this is a parse error and
            the stack is left alone. A null head element pointer can never be
            the current node. */
            if($this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack))) {
                array_pop($this->stack);
            }

            /* Change the insertion mode to "after head". */
            $this->mode = self::AFTER_HEAD;

        /* A start tag with the tag name "head" or an end tag except "html". */
        } elseif(($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')) {
            // Parse error. Ignore the token.

        /* Anything else */
        } else {
            /* If the current node is a head element, act as if an end tag
            token with the tag name "head" had been seen. */
            if($this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack))) {
                $this->inHead(array(
                    'name' => 'head',
                    'type' => HTML5::ENDTAG
                ));

            /* Otherwise, change the insertion mode to "after head". */
            } else {
                $this->mode = self::AFTER_HEAD;
            }

            /* Then, reprocess the current token. */
            return $this->afterHead($token);
        }
    }
1547
1548
    /* Token handler for the "after head" insertion mode.

    $token is an associative array carrying at least a 'type' key; 'data' is
    read for character and comment tokens, 'name' for start tags.

    Returns nothing when the token is consumed here. When the token is handed
    on to inHead() or inBody(), the result of that call is returned unchanged
    (the visible part of inHead() returns content model flags such as
    HTML5::RCDATA or HTML5::CDATA for some start tags). */
    private function afterHead($token)
    {
        $type = $token['type'];

        /* Whitespace-only character data (U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        U+0020 SPACE): append it to the current node. */
        if($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            $this->insertText($token['data']);
            return;
        }

        /* Comment token: append a Comment node holding the token's data to
        the current node. */
        if($type === HTML5::COMMENT) {
            $this->insertComment($token['data']);
            return;
        }

        if($type === HTML5::STARTTAG) {
            $tagName = $token['name'];

            /* <body>: insert the element, then move to "in body". */
            if($tagName === 'body') {
                $this->insertElement($token);
                $this->mode = self::IN_BODY;
                return;
            }

            /* <frameset>: insert the element, then move to "in frameset". */
            if($tagName === 'frameset') {
                $this->insertElement($token);
                $this->mode = self::IN_FRAME;
                return;
            }

            /* Tags that belong in the head ("base", "link", "meta", "script",
            "style", "title"): parse error. Go back to "in head" and let that
            mode reprocess the token. */
            $headOnlyTags = array('base', 'link', 'meta', 'script', 'style', 'title');

            if(in_array($tagName, $headOnlyTags)) {
                $this->mode = self::IN_HEAD;
                return $this->inHead($token);
            }
        }

        /* Anything else: behave as though a "body" start tag without
        attributes had been seen, then reprocess the current token as body
        content. */
        $impliedBody = array(
            'name' => 'body',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        );
        $this->afterHead($impliedBody);

        return $this->inBody($token);
    }
1604
1605
    private function inBody($token)
1606
    {
1607
        /* Handle the token as follows: */
1608
1609
        switch($token['type']) {
1610
            /* A character token */
1611
            case HTML5::CHARACTR:
1612
                /* Reconstruct the active formatting elements, if any. */
1613
                $this->reconstructActiveFormattingElements();
1614
1615
                /* Append the token's character to the current node. */
1616
                $this->insertText($token['data']);
1617
            break;
1618
1619
            /* A comment token */
1620
            case HTML5::COMMENT:
1621
                /* Append a Comment node to the current node with the data
1622
                attribute set to the data given in the comment token. */
1623
                $this->insertComment($token['data']);
1624
            break;
1625
1626
            case HTML5::STARTTAG:
1627
            switch($token['name']) {
1628
                /* A start tag token whose tag name is one of: "script",
1629
                "style" */
1630
                case 'script': case 'style':
1631
                    /* Process the token as if the insertion mode had been "in
1632
                    head". */
1633
                    return $this->inHead($token);
1634
                break;
1635
1636
                /* A start tag token whose tag name is one of: "base", "link",
1637
                "meta", "title" */
1638
                case 'base': case 'link': case 'meta': case 'title':
1639
                    /* Parse error. Process the token as if the insertion mode
1640
                    had    been "in head". */
1641
                    return $this->inHead($token);
1642
                break;
1643
1644
                /* A start tag token with the tag name "body" */
1645
                case 'body':
1646
                    /* Parse error. If the second element on the stack of open
1647
                    elements is not a body element, or, if the stack of open
1648
                    elements has only one node on it, then ignore the token.
1649
                    (innerHTML case) */
1650
                    if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
1651
                        // Ignore
1652
1653
                    /* Otherwise, for each attribute on the token, check to see
1654
                    if the attribute is already present on the body element (the
1655
                    second element)    on the stack of open elements. If it is not,
1656
                    add the attribute and its corresponding value to that
1657
                    element. */
1658
                    } else {
1659
                        foreach($token['attr'] as $attr) {
1660
                            if(!$this->stack[1]->hasAttribute($attr['name'])) {
1661
                                $this->stack[1]->setAttribute($attr['name'], $attr['value']);
1662
                            }
1663
                        }
1664
                    }
1665
                break;
1666
1667
                /* A start tag whose tag name is one of: "address",
1668
                "blockquote", "center", "dir", "div", "dl", "fieldset",
1669
                "listing", "menu", "ol", "p", "ul" */
1670
                case 'address': case 'blockquote': case 'center': case 'dir':
1671
                case 'div': case 'dl': case 'fieldset': case 'listing':
1672
                case 'menu': case 'ol': case 'p': case 'ul':
1673
                    /* If the stack of open elements has a p element in scope,
1674
                    then act as if an end tag with the tag name p had been
1675
                    seen. */
1676
                    if($this->elementInScope('p')) {
1677
                        $this->emitToken(array(
1678
                            'name' => 'p',
1679
                            'type' => HTML5::ENDTAG
1680
                        ));
1681
                    }
1682
1683
                    /* Insert an HTML element for the token. */
1684
                    $this->insertElement($token);
1685
                break;
1686
1687
                /* A start tag whose tag name is "form" */
1688
                case 'form':
1689
                    /* If the form element pointer is not null, ignore the
1690
                    token with a parse error. */
1691
                    if($this->form_pointer !== null) {
1692
                        // Ignore.
1693
1694
                    /* Otherwise: */
1695
                    } else {
1696
                        /* If the stack of open elements has a p element in
1697
                        scope, then act as if an end tag with the tag name p
1698
                        had been seen. */
1699
                        if($this->elementInScope('p')) {
1700
                            $this->emitToken(array(
1701
                                'name' => 'p',
1702
                                'type' => HTML5::ENDTAG
1703
                            ));
1704
                        }
1705
1706
                        /* Insert an HTML element for the token, and set the
1707
                        form element pointer to point to the element created. */
1708
                        $element = $this->insertElement($token);
1709
                        $this->form_pointer = $element;
1710
                    }
1711
                break;
1712
1713
                /* A start tag whose tag name is "li", "dd" or "dt" */
1714
                case 'li': case 'dd': case 'dt':
1715
                    /* If the stack of open elements has a p  element in scope,
1716
                    then act as if an end tag with the tag name p had been
1717
                    seen. */
1718
                    if($this->elementInScope('p')) {
1719
                        $this->emitToken(array(
1720
                            'name' => 'p',
1721
                            'type' => HTML5::ENDTAG
1722
                        ));
1723
                    }
1724
1725
                    $stack_length = count($this->stack) - 1;
1726
1727
                    for($n = $stack_length; 0 <= $n; $n--) {
1728
                        /* 1. Initialise node to be the current node (the
1729
                        bottommost node of the stack). */
1730
                        $stop = false;
1731
                        $node = $this->stack[$n];
1732
                        $cat  = $this->getElementCategory($node->tagName);
1733
1734
                        /* 2. If node is an li, dd or dt element, then pop all
1735
                        the    nodes from the current node up to node, including
1736
                        node, then stop this algorithm. */
1737
                        if($token['name'] === $node->tagName ||    ($token['name'] !== 'li'
1738
                        && ($node->tagName === 'dd' || $node->tagName === 'dt'))) {
1739
                            for($x = $stack_length; $x >= $n ; $x--) {
1740
                                array_pop($this->stack);
1741
                            }
1742
1743
                            break;
1744
                        }
1745
1746
                        /* 3. If node is not in the formatting category, and is
1747
                        not    in the phrasing category, and is not an address or
1748
                        div element, then stop this algorithm. */
1749
                        if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
1750
                        $node->tagName !== 'address' && $node->tagName !== 'div') {
1751
                            break;
1752
                        }
1753
                    }
1754
1755
                    /* Finally, insert an HTML element with the same tag
1756
                    name as the    token's. */
1757
                    $this->insertElement($token);
1758
                break;
1759
1760
                /* A start tag token whose tag name is "plaintext" */
1761
                case 'plaintext':
1762
                    /* If the stack of open elements has a p  element in scope,
1763
                    then act as if an end tag with the tag name p had been
1764
                    seen. */
1765
                    if($this->elementInScope('p')) {
1766
                        $this->emitToken(array(
1767
                            'name' => 'p',
1768
                            'type' => HTML5::ENDTAG
1769
                        ));
1770
                    }
1771
1772
                    /* Insert an HTML element for the token. */
1773
                    $this->insertElement($token);
1774
1775
                    return HTML5::PLAINTEXT;
1776
                break;
1777
1778
                /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
1779
                "h5", "h6" */
1780
                case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
1781
                    /* If the stack of open elements has a p  element in scope,
1782
                    then act as if an end tag with the tag name p had been seen. */
1783
                    if($this->elementInScope('p')) {
1784
                        $this->emitToken(array(
1785
                            'name' => 'p',
1786
                            'type' => HTML5::ENDTAG
1787
                        ));
1788
                    }
1789
1790
                    /* If the stack of open elements has in scope an element whose
1791
                    tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
1792
                    this is a parse error; pop elements from the stack until an
1793
                    element with one of those tag names has been popped from the
1794
                    stack. */
1795
                    while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
1796
                        array_pop($this->stack);
1797
                    }
1798
1799
                    /* Insert an HTML element for the token. */
1800
                    $this->insertElement($token);
1801
                break;
1802
1803
                /* A start tag whose tag name is "a" */
1804
                case 'a':
1805
                    /* If the list of active formatting elements contains
1806
                    an element whose tag name is "a" between the end of the
1807
                    list and the last marker on the list (or the start of
1808
                    the list if there is no marker on the list), then this
1809
                    is a parse error; act as if an end tag with the tag name
1810
                    "a" had been seen, then remove that element from the list
1811
                    of active formatting elements and the stack of open
1812
                    elements if the end tag didn't already remove it (it
1813
                    might not have if the element is not in table scope). */
1814
                    $leng = count($this->a_formatting);
1815
1816
                    for($n = $leng - 1; $n >= 0; $n--) {
1817
                        if($this->a_formatting[$n] === self::MARKER) {
1818
                            break;
1819
1820
                        } elseif($this->a_formatting[$n]->nodeName === 'a') {
1821
                            $this->emitToken(array(
1822
                                'name' => 'a',
1823
                                'type' => HTML5::ENDTAG
1824
                            ));
1825
                            break;
1826
                        }
1827
                    }
1828
1829
                    /* Reconstruct the active formatting elements, if any. */
1830
                    $this->reconstructActiveFormattingElements();
1831
1832
                    /* Insert an HTML element for the token. */
1833
                    $el = $this->insertElement($token);
1834
1835
                    /* Add that element to the list of active formatting
1836
                    elements. */
1837
                    $this->a_formatting[] = $el;
1838
                break;
1839
1840
                /* A start tag whose tag name is one of: "b", "big", "em", "font",
1841
                "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
1842
                case 'b': case 'big': case 'em': case 'font': case 'i':
1843
                case 'nobr': case 's': case 'small': case 'strike':
1844
                case 'strong': case 'tt': case 'u':
1845
                    /* Reconstruct the active formatting elements, if any. */
1846
                    $this->reconstructActiveFormattingElements();
1847
1848
                    /* Insert an HTML element for the token. */
1849
                    $el = $this->insertElement($token);
1850
1851
                    /* Add that element to the list of active formatting
1852
                    elements. */
1853
                    $this->a_formatting[] = $el;
1854
                break;
1855
1856
                /* A start tag token whose tag name is "button" */
1857
                case 'button':
1858
                    /* If the stack of open elements has a button element in scope,
1859
                    then this is a parse error; act as if an end tag with the tag
1860
                    name "button" had been seen, then reprocess the token. (We don't
1861
                    do that. Unnecessary.) */
1862
                    if($this->elementInScope('button')) {
1863
                        $this->inBody(array(
1864
                            'name' => 'button',
1865
                            'type' => HTML5::ENDTAG
1866
                        ));
1867
                    }
1868
1869
                    /* Reconstruct the active formatting elements, if any. */
1870
                    $this->reconstructActiveFormattingElements();
1871
1872
                    /* Insert an HTML element for the token. */
1873
                    $this->insertElement($token);
1874
1875
                    /* Insert a marker at the end of the list of active
1876
                    formatting elements. */
1877
                    $this->a_formatting[] = self::MARKER;
1878
                break;
1879
1880
                /* A start tag token whose tag name is one of: "marquee", "object" */
1881
                case 'marquee': case 'object':
1882
                    /* Reconstruct the active formatting elements, if any. */
1883
                    $this->reconstructActiveFormattingElements();
1884
1885
                    /* Insert an HTML element for the token. */
1886
                    $this->insertElement($token);
1887
1888
                    /* Insert a marker at the end of the list of active
1889
                    formatting elements. */
1890
                    $this->a_formatting[] = self::MARKER;
1891
                break;
1892
1893
                /* A start tag token whose tag name is "xmp" */
1894
                case 'xmp':
1895
                    /* Reconstruct the active formatting elements, if any. */
1896
                    $this->reconstructActiveFormattingElements();
1897
1898
                    /* Insert an HTML element for the token. */
1899
                    $this->insertElement($token);
1900
1901
                    /* Switch the content model flag to the CDATA state. */
1902
                    return HTML5::CDATA;
1903
                break;
1904
1905
                /* A start tag whose tag name is "table" */
1906
                case 'table':
1907
                    /* If the stack of open elements has a p element in scope,
1908
                    then act as if an end tag with the tag name p had been seen. */
1909
                    if($this->elementInScope('p')) {
1910
                        $this->emitToken(array(
1911
                            'name' => 'p',
1912
                            'type' => HTML5::ENDTAG
1913
                        ));
1914
                    }
1915
1916
                    /* Insert an HTML element for the token. */
1917
                    $this->insertElement($token);
1918
1919
                    /* Change the insertion mode to "in table". */
1920
                    $this->mode = self::IN_TABLE;
1921
                break;
1922
1923
                /* A start tag whose tag name is one of: "area", "basefont",
1924
                "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
1925
                case 'area': case 'basefont': case 'bgsound': case 'br':
1926
                case 'embed': case 'img': case 'param': case 'spacer':
1927
                case 'wbr':
1928
                    /* Reconstruct the active formatting elements, if any. */
1929
                    $this->reconstructActiveFormattingElements();
1930
1931
                    /* Insert an HTML element for the token. */
1932
                    $this->insertElement($token);
1933
1934
                    /* Immediately pop the current node off the stack of open elements. */
1935
                    array_pop($this->stack);
1936
                break;
1937
1938
                /* A start tag whose tag name is "hr" */
1939
                case 'hr':
1940
                    /* If the stack of open elements has a p element in scope,
1941
                    then act as if an end tag with the tag name p had been seen. */
1942
                    if($this->elementInScope('p')) {
1943
                        $this->emitToken(array(
1944
                            'name' => 'p',
1945
                            'type' => HTML5::ENDTAG
1946
                        ));
1947
                    }
1948
1949
                    /* Insert an HTML element for the token. */
1950
                    $this->insertElement($token);
1951
1952
                    /* Immediately pop the current node off the stack of open elements. */
1953
                    array_pop($this->stack);
1954
                break;
1955
1956
                /* A start tag whose tag name is "image" */
1957
                case 'image':
1958
                    /* Parse error. Change the token's tag name to "img" and
1959
                    reprocess it. (Don't ask.) */
1960
                    $token['name'] = 'img';
1961
                    return $this->inBody($token);
1962
                break;
1963
1964
                /* A start tag whose tag name is "input" */
1965
                case 'input':
1966
                    /* Reconstruct the active formatting elements, if any. */
1967
                    $this->reconstructActiveFormattingElements();
1968
1969
                    /* Insert an input element for the token. */
1970
                    $element = $this->insertElement($token, false);
1971
1972
                    /* If the form element pointer is not null, then associate the
1973
                    input element with the form element pointed to by the form
1974
                    element pointer. */
1975
                    $this->form_pointer !== null
1976
                        ? $this->form_pointer->appendChild($element)
1977
                        : end($this->stack)->appendChild($element);
1978
1979
                    /* Pop that input element off the stack of open elements. */
1980
                    array_pop($this->stack);
1981
                break;
1982
1983
                /* A start tag whose tag name is "isindex" */
1984
                case 'isindex':
1985
                    /* Parse error. */
1986
                    // w/e
1987
1988
                    /* If the form element pointer is not null,
1989
                    then ignore the token. */
1990
                    if($this->form_pointer === null) {
1991
                        /* Act as if a start tag token with the tag name "form" had
1992
                        been seen. */
1993
                        $this->inBody(array(
1994
                            'name' => 'body',
1995
                            'type' => HTML5::STARTTAG,
1996
                            'attr' => array()
1997
                        ));
1998
1999
                        /* Act as if a start tag token with the tag name "hr" had
2000
                        been seen. */
2001
                        $this->inBody(array(
2002
                            'name' => 'hr',
2003
                            'type' => HTML5::STARTTAG,
2004
                            'attr' => array()
2005
                        ));
2006
2007
                        /* Act as if a start tag token with the tag name "p" had
2008
                        been seen. */
2009
                        $this->inBody(array(
2010
                            'name' => 'p',
2011
                            'type' => HTML5::STARTTAG,
2012
                            'attr' => array()
2013
                        ));
2014
2015
                        /* Act as if a start tag token with the tag name "label"
2016
                        had been seen. */
2017
                        $this->inBody(array(
2018
                            'name' => 'label',
2019
                            'type' => HTML5::STARTTAG,
2020
                            'attr' => array()
2021
                        ));
2022
2023
                        /* Act as if a stream of character tokens had been seen. */
2024
                        $this->insertText('This is a searchable index. '.
2025
                        'Insert your search keywords here: ');
2026
2027
                        /* Act as if a start tag token with the tag name "input"
2028
                        had been seen, with all the attributes from the "isindex"
2029
                        token, except with the "name" attribute set to the value
2030
                        "isindex" (ignoring any explicit "name" attribute). */
2031
                        $attr = $token['attr'];
2032
                        $attr[] = array('name' => 'name', 'value' => 'isindex');
2033
2034
                        $this->inBody(array(
2035
                            'name' => 'input',
2036
                            'type' => HTML5::STARTTAG,
2037
                            'attr' => $attr
2038
                        ));
2039
2040
                        /* Act as if a stream of character tokens had been seen
2041
                        (see below for what they should say). */
2042
                        $this->insertText('This is a searchable index. '.
2043
                        'Insert your search keywords here: ');
2044
2045
                        /* Act as if an end tag token with the tag name "label"
2046
                        had been seen. */
2047
                        $this->inBody(array(
2048
                            'name' => 'label',
2049
                            'type' => HTML5::ENDTAG
2050
                        ));
2051
2052
                        /* Act as if an end tag token with the tag name "p" had
2053
                        been seen. */
2054
                        $this->inBody(array(
2055
                            'name' => 'p',
2056
                            'type' => HTML5::ENDTAG
2057
                        ));
2058
2059
                        /* Act as if a start tag token with the tag name "hr" had
2060
                        been seen. */
2061
                        $this->inBody(array(
2062
                            'name' => 'hr',
2063
                            'type' => HTML5::ENDTAG
2064
                        ));
2065
2066
                        /* Act as if an end tag token with the tag name "form" had
2067
                        been seen. */
2068
                        $this->inBody(array(
2069
                            'name' => 'form',
2070
                            'type' => HTML5::ENDTAG
2071
                        ));
2072
                    }
2073
                break;
2074
2075
                /* A start tag whose tag name is "textarea" */
2076
                case 'textarea':
2077
                    $this->insertElement($token);
2078
2079
                    /* Switch the tokeniser's content model flag to the
2080
                    RCDATA state. */
2081
                    return HTML5::RCDATA;
2082
                break;
2083
2084
                /* A start tag whose tag name is one of: "iframe", "noembed",
2085
                "noframes" */
2086
                case 'iframe': case 'noembed': case 'noframes':
2087
                    $this->insertElement($token);
2088
2089
                    /* Switch the tokeniser's content model flag to the CDATA state. */
2090
                    return HTML5::CDATA;
2091
                break;
2092
2093
                /* A start tag whose tag name is "select" */
2094
                case 'select':
2095
                    /* Reconstruct the active formatting elements, if any. */
2096
                    $this->reconstructActiveFormattingElements();
2097
2098
                    /* Insert an HTML element for the token. */
2099
                    $this->insertElement($token);
2100
2101
                    /* Change the insertion mode to "in select". */
2102
                    $this->mode = self::IN_SELECT;
2103
                break;
2104
2105
                /* A start or end tag whose tag name is one of: "caption", "col",
2106
                "colgroup", "frame", "frameset", "head", "option", "optgroup",
2107
                "tbody", "td", "tfoot", "th", "thead", "tr". */
2108
                case 'caption': case 'col': case 'colgroup': case 'frame':
2109
                case 'frameset': case 'head': case 'option': case 'optgroup':
2110
                case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead':
2111
                case 'tr':
2112
                    // Parse error. Ignore the token.
2113
                break;
2114
2115
                /* A start or end tag whose tag name is one of: "event-source",
2116
                "section", "nav", "article", "aside", "header", "footer",
2117
                "datagrid", "command" */
2118
                case 'event-source': case 'section': case 'nav': case 'article':
2119
                case 'aside': case 'header': case 'footer': case 'datagrid':
2120
                case 'command':
2121
                    // Work in progress!
2122
                break;
2123
2124
                /* A start tag token not covered by the previous entries */
2125
                default:
2126
                    /* Reconstruct the active formatting elements, if any. */
2127
                    $this->reconstructActiveFormattingElements();
2128
2129
                    $this->insertElement($token);
2130
                break;
2131
            }
2132
            break;
2133
2134
            case HTML5::ENDTAG:
2135
            switch($token['name']) {
2136
                /* An end tag with the tag name "body" */
2137
                case 'body':
2138
                    /* If the second element in the stack of open elements is
2139
                    not a body element, this is a parse error. Ignore the token.
2140
                    (innerHTML case) */
2141
                    if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2142
                        // Ignore.
2143
2144
                    /* If the current node is not the body element, then this
2145
                    is a parse error. */
2146
                    } elseif(end($this->stack)->nodeName !== 'body') {
2147
                        // Parse error.
2148
                    }
2149
2150
                    /* Change the insertion mode to "after body". */
2151
                    $this->mode = self::AFTER_BODY;
2152
                break;
2153
2154
                /* An end tag with the tag name "html" */
2155
                case 'html':
2156
                    /* Act as if an end tag with tag name "body" had been seen,
2157
                    then, if that token wasn't ignored, reprocess the current
2158
                    token. */
2159
                    $this->inBody(array(
2160
                        'name' => 'body',
2161
                        'type' => HTML5::ENDTAG
2162
                    ));
2163
2164
                    return $this->afterBody($token);
2165
                break;
2166
2167
                /* An end tag whose tag name is one of: "address", "blockquote",
2168
                "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2169
                "ol", "pre", "ul" */
2170
                case 'address': case 'blockquote': case 'center': case 'dir':
2171
                case 'div': case 'dl': case 'fieldset': case 'listing':
2172
                case 'menu': case 'ol': case 'pre': case 'ul':
2173
                    /* If the stack of open elements has an element in scope
2174
                    with the same tag name as that of the token, then generate
2175
                    implied end tags. */
2176
                    if($this->elementInScope($token['name'])) {
2177
                        $this->generateImpliedEndTags();
2178
2179
                        /* Now, if the current node is not an element with
2180
                        the same tag name as that of the token, then this
2181
                        is a parse error. */
2182
                        // w/e
2183
2184
                        /* If the stack of open elements has an element in
2185
                        scope with the same tag name as that of the token,
2186
                        then pop elements from this stack until an element
2187
                        with that tag name has been popped from the stack. */
2188
                        for($n = count($this->stack) - 1; $n >= 0; $n--) {
2189
                            if($this->stack[$n]->nodeName === $token['name']) {
2190
                                $n = -1;
2191
                            }
2192
2193
                            array_pop($this->stack);
2194
                        }
2195
                    }
2196
                break;
2197
2198
                /* An end tag whose tag name is "form" */
2199
                case 'form':
2200
                    /* If the stack of open elements has an element in scope
2201
                    with the same tag name as that of the token, then generate
2202
                    implied end tags. */
2203
                    if($this->elementInScope($token['name'])) {
2204
                        $this->generateImpliedEndTags();
2205
2206
                    }
2207
2208
                    if(end($this->stack)->nodeName !== $token['name']) {
2209
                        /* Now, if the current node is not an element with the
2210
                        same tag name as that of the token, then this is a parse
2211
                        error. */
2212
                        // w/e
2213
2214
                    } else {
2215
                        /* Otherwise, if the current node is an element with
2216
                        the same tag name as that of the token pop that element
2217
                        from the stack. */
2218
                        array_pop($this->stack);
2219
                    }
2220
2221
                    /* In any case, set the form element pointer to null. */
2222
                    $this->form_pointer = null;
2223
                break;
2224
2225
                /* An end tag whose tag name is "p" */
2226
                case 'p':
2227
                    /* If the stack of open elements has a p element in scope,
2228
                    then generate implied end tags, except for p elements. */
2229
                    if($this->elementInScope('p')) {
2230
                        $this->generateImpliedEndTags(array('p'));
2231
2232
                        /* If the current node is not a p element, then this is
2233
                        a parse error. */
2234
                        // k
2235
2236
                        /* If the stack of open elements has a p element in
2237
                        scope, then pop elements from this stack until the stack
2238
                        no longer has a p element in scope. */
2239
                        for($n = count($this->stack) - 1; $n >= 0; $n--) {
2240
                            if($this->elementInScope('p')) {
2241
                                array_pop($this->stack);
2242
2243
                            } else {
2244
                                break;
2245
                            }
2246
                        }
2247
                    }
2248
                break;
2249
2250
                /* An end tag whose tag name is "dd", "dt", or "li" */
2251
                case 'dd': case 'dt': case 'li':
2252
                    /* If the stack of open elements has an element in scope
2253
                    whose tag name matches the tag name of the token, then
2254
                    generate implied end tags, except for elements with the
2255
                    same tag name as the token. */
2256
                    if($this->elementInScope($token['name'])) {
2257
                        $this->generateImpliedEndTags(array($token['name']));
2258
2259
                        /* If the current node is not an element with the same
2260
                        tag name as the token, then this is a parse error. */
2261
                        // w/e
2262
2263
                        /* If the stack of open elements has an element in scope
2264
                        whose tag name matches the tag name of the token, then
2265
                        pop elements from this stack until an element with that
2266
                        tag name has been popped from the stack. */
2267
                        for($n = count($this->stack) - 1; $n >= 0; $n--) {
2268
                            if($this->stack[$n]->nodeName === $token['name']) {
2269
                                $n = -1;
2270
                            }
2271
2272
                            array_pop($this->stack);
2273
                        }
2274
                    }
2275
                break;
2276
2277
                /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2278
                "h5", "h6" */
2279
                case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
2280
                    $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2281
2282
                    /* If the stack of open elements has in scope an element whose
2283
                    tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2284
                    generate implied end tags. */
2285
                    if($this->elementInScope($elements)) {
2286
                        $this->generateImpliedEndTags();
2287
2288
                        /* Now, if the current node is not an element with the same
2289
                        tag name as that of the token, then this is a parse error. */
2290
                        // w/e
2291
2292
                        /* If the stack of open elements has in scope an element
2293
                        whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2294
                        "h6", then pop elements from the stack until an element
2295
                        with one of those tag names has been popped from the stack. */
2296
                        while($this->elementInScope($elements)) {
2297
                            array_pop($this->stack);
2298
                        }
2299
                    }
2300
                break;
2301
2302
                /* An end tag whose tag name is one of: "a", "b", "big", "em",
2303
                "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2304
                case 'a': case 'b': case 'big': case 'em': case 'font':
2305
                case 'i': case 'nobr': case 's': case 'small': case 'strike':
2306
                case 'strong': case 'tt': case 'u':
2307
                    /* 1. Let the formatting element be the last element in
2308
                    the list of active formatting elements that:
2309
                        * is between the end of the list and the last scope
2310
                        marker in the list, if any, or the start of the list
2311
                        otherwise, and
2312
                        * has the same tag name as the token.
2313
                    */
2314
                    while(true) {
2315
                        for($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
2316
                            if($this->a_formatting[$a] === self::MARKER) {
2317
                                break;
2318
2319
                            } elseif($this->a_formatting[$a]->tagName === $token['name']) {
2320
                                $formatting_element = $this->a_formatting[$a];
2321
                                $in_stack = in_array($formatting_element, $this->stack, true);
2322
                                $fe_af_pos = $a;
2323
                                break;
2324
                            }
2325
                        }
2326
2327
                        /* If there is no such node, or, if that node is
2328
                        also in the stack of open elements but the element
2329
                        is not in scope, then this is a parse error. Abort
2330
                        these steps. The token is ignored. */
2331
                        if(!isset($formatting_element) || ($in_stack &&
2332
                        !$this->elementInScope($token['name']))) {
2333
                            break;
2334
2335
                        /* Otherwise, if there is such a node, but that node
2336
                        is not in the stack of open elements, then this is a
2337
                        parse error; remove the element from the list, and
2338
                        abort these steps. */
2339
                        } elseif(isset($formatting_element) && !$in_stack) {
2340
                            unset($this->a_formatting[$fe_af_pos]);
2341
                            $this->a_formatting = array_merge($this->a_formatting);
2342
                            break;
2343
                        }
2344
2345
                        /* 2. Let the furthest block be the topmost node in the
2346
                        stack of open elements that is lower in the stack
2347
                        than the formatting element, and is not an element in
2348
                        the phrasing or formatting categories. There might
2349
                        not be one. */
2350
                        $fe_s_pos = array_search($formatting_element, $this->stack, true);
2351
                        $length = count($this->stack);
2352
2353
                        for($s = $fe_s_pos + 1; $s < $length; $s++) {
2354
                            $category = $this->getElementCategory($this->stack[$s]->nodeName);
2355
2356
                            if($category !== self::PHRASING && $category !== self::FORMATTING) {
2357
                                $furthest_block = $this->stack[$s];
2358
                            }
2359
                        }
2360
2361
                        /* 3. If there is no furthest block, then the UA must
2362
                        skip the subsequent steps and instead just pop all
2363
                        the nodes from the bottom of the stack of open
2364
                        elements, from the current node up to the formatting
2365
                        element, and remove the formatting element from the
2366
                        list of active formatting elements. */
2367
                        if(!isset($furthest_block)) {
2368
                            for($n = $length - 1; $n >= $fe_s_pos; $n--) {
2369
                                array_pop($this->stack);
2370
                            }
2371
2372
                            unset($this->a_formatting[$fe_af_pos]);
2373
                            $this->a_formatting = array_merge($this->a_formatting);
2374
                            break;
2375
                        }
2376
2377
                        /* 4. Let the common ancestor be the element
2378
                        immediately above the formatting element in the stack
2379
                        of open elements. */
2380
                        $common_ancestor = $this->stack[$fe_s_pos - 1];
2381
2382
                        /* 5. If the furthest block has a parent node, then
2383
                        remove the furthest block from its parent node. */
2384
                        if($furthest_block->parentNode !== null) {
2385
                            $furthest_block->parentNode->removeChild($furthest_block);
2386
                        }
2387
2388
                        /* 6. Let a bookmark note the position of the
2389
                        formatting element in the list of active formatting
2390
                        elements relative to the elements on either side
2391
                        of it in the list. */
2392
                        $bookmark = $fe_af_pos;
2393
2394
                        /* 7. Let node and last node be the furthest block.
2395
                        Follow these steps: */
2396
                        $node = $furthest_block;
2397
                        $last_node = $furthest_block;
2398
2399
                        while(true) {
2400
                            for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
2401
                                /* 7.1 Let node be the element immediately
2402
                                prior to node in the stack of open elements. */
2403
                                $node = $this->stack[$n];
2404
2405
                                /* 7.2 If node is not in the list of active
2406
                                formatting elements, then remove node from
2407
                                the stack of open elements and then go back
2408
                                to step 1. */
2409
                                if(!in_array($node, $this->a_formatting, true)) {
2410
                                    unset($this->stack[$n]);
2411
                                    $this->stack = array_merge($this->stack);
2412
2413
                                } else {
2414
                                    break;
2415
                                }
2416
                            }
2417
2418
                            /* 7.3 Otherwise, if node is the formatting
2419
                            element, then go to the next step in the overall
2420
                            algorithm. */
2421
                            if($node === $formatting_element) {
2422
                                break;
2423
2424
                            /* 7.4 Otherwise, if last node is the furthest
2425
                            block, then move the aforementioned bookmark to
2426
                            be immediately after the node in the list of
2427
                            active formatting elements. */
2428
                            } elseif($last_node === $furthest_block) {
2429
                                $bookmark = array_search($node, $this->a_formatting, true) + 1;
2430
                            }
2431
2432
                            /* 7.5 If node has any children, perform a
2433
                            shallow clone of node, replace the entry for
2434
                            node in the list of active formatting elements
2435
                            with an entry for the clone, replace the entry
2436
                            for node in the stack of open elements with an
2437
                            entry for the clone, and let node be the clone. */
2438
                            if($node->hasChildNodes()) {
2439
                                $clone = $node->cloneNode();
2440
                                $s_pos = array_search($node, $this->stack, true);
2441
                                $a_pos = array_search($node, $this->a_formatting, true);
2442
2443
                                $this->stack[$s_pos] = $clone;
2444
                                $this->a_formatting[$a_pos] = $clone;
2445
                                $node = $clone;
2446
                            }
2447
2448
                            /* 7.6 Insert last node into node, first removing
2449
                            it from its previous parent node if any. */
2450
                            if($last_node->parentNode !== null) {
2451
                                $last_node->parentNode->removeChild($last_node);
2452
                            }
2453
2454
                            $node->appendChild($last_node);
2455
2456
                            /* 7.7 Let last node be node. */
2457
                            $last_node = $node;
2458
                        }
2459
2460
                        /* 8. Insert whatever last node ended up being in
2461
                        the previous step into the common ancestor node,
2462
                        first removing it from its previous parent node if
2463
                        any. */
2464
                        if($last_node->parentNode !== null) {
2465
                            $last_node->parentNode->removeChild($last_node);
2466
                        }
2467
2468
                        $common_ancestor->appendChild($last_node);
2469
2470
                        /* 9. Perform a shallow clone of the formatting
2471
                        element. */
2472
                        $clone = $formatting_element->cloneNode();
2473
2474
                        /* 10. Take all of the child nodes of the furthest
2475
                        block and append them to the clone created in the
2476
                        last step. */
2477
                        while($furthest_block->hasChildNodes()) {
2478
                            $child = $furthest_block->firstChild;
2479
                            $furthest_block->removeChild($child);
2480
                            $clone->appendChild($child);
2481
                        }
2482
2483
                        /* 11. Append that clone to the furthest block. */
2484
                        $furthest_block->appendChild($clone);
2485
2486
                        /* 12. Remove the formatting element from the list
2487
                        of active formatting elements, and insert the clone
2488
                        into the list of active formatting elements at the
2489
                        position of the aforementioned bookmark. */
2490
                        $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
2491
                        unset($this->a_formatting[$fe_af_pos]);
2492
                        $this->a_formatting = array_merge($this->a_formatting);
2493
2494
                        $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
2495
                        $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
2496
                        $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
2497
2498
                        /* 13. Remove the formatting element from the stack
2499
                        of open elements, and insert the clone into the stack
2500
                        of open elements immediately after (i.e. in a more
2501
                        deeply nested position than) the position of the
2502
                        furthest block in that stack. */
2503
                        $fe_s_pos = array_search($formatting_element, $this->stack, true);
2504
                        $fb_s_pos = array_search($furthest_block, $this->stack, true);
2505
                        unset($this->stack[$fe_s_pos]);
2506
2507
                        $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
2508
                        $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
2509
                        $this->stack = array_merge($s_part1, array($clone), $s_part2);
2510
2511
                        /* 14. Jump back to step 1 in this series of steps. */
2512
                        unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
2513
                    }
2514
                break;
2515
2516
                /* An end tag token whose tag name is one of: "button",
2517
                "marquee", "object" */
2518
                case 'button': case 'marquee': case 'object':
2519
                    /* If the stack of open elements has an element in scope whose
2520
                    tag name matches the tag name of the token, then generate implied
2521
                    end tags. */
2522
                    if($this->elementInScope($token['name'])) {
2523
                        $this->generateImpliedEndTags();
2524
2525
                        /* Now, if the current node is not an element with the same
2526
                        tag name as the token, then this is a parse error. */
2527
                        // k
2528
2529
                        /* Now, if the stack of open elements has an element in scope
2530
                        whose tag name matches the tag name of the token, then pop
2531
                        elements from the stack until that element has been popped from
2532
                        the stack, and clear the list of active formatting elements up
2533
                        to the last marker. */
2534
                        for($n = count($this->stack) - 1; $n >= 0; $n--) {
2535
                            if($this->stack[$n]->nodeName === $token['name']) {
2536
                                $n = -1;
2537
                            }
2538
2539
                            array_pop($this->stack);
2540
                        }
2541
2542
                        $marker = end(array_keys($this->a_formatting, self::MARKER, true));
2543
2544
                        for($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
2545
                            array_pop($this->a_formatting);
2546
                        }
2547
                    }
2548
                break;
2549
2550
                /* Or an end tag whose tag name is one of: "area", "basefont",
2551
                "bgsound", "br", "embed", "hr", "iframe", "image", "img",
2552
                "input", "isindex", "noembed", "noframes", "param", "select",
2553
                "spacer", "table", "textarea", "wbr" */
2554
                case 'area': case 'basefont': case 'bgsound': case 'br':
2555
                case 'embed': case 'hr': case 'iframe': case 'image':
2556
                case 'img': case 'input': case 'isindex': case 'noembed':
2557
                case 'noframes': case 'param': case 'select': case 'spacer':
2558
                case 'table': case 'textarea': case 'wbr':
2559
                    // Parse error. Ignore the token.
2560
                break;
2561
2562
                /* An end tag token not covered by the previous entries */
2563
                default:
2564
                    for($n = count($this->stack) - 1; $n >= 0; $n--) {
2565
                        /* Initialise node to be the current node (the bottommost
2566
                        node of the stack). */
2567
                        $node = end($this->stack);
2568
2569
                        /* If node has the same tag name as the end tag token,
2570
                        then: */
2571
                        if($token['name'] === $node->nodeName) {
2572
                            /* Generate implied end tags. */
2573
                            $this->generateImpliedEndTags();
2574
2575
                            /* If the tag name of the end tag token does not
2576
                            match the tag name of the current node, this is a
2577
                            parse error. */
2578
                            // k
2579
2580
                            /* Pop all the nodes from the current node up to
2581
                            node, including node, then stop this algorithm. */
2582
                            for($x = count($this->stack) - $n; $x >= $n; $x--) {
2583
                                array_pop($this->stack);
2584
                            }
2585
2586
                        } else {
2587
                            $category = $this->getElementCategory($node);
2588
2589
                            if($category !== self::SPECIAL && $category !== self::SCOPING) {
2590
                                /* Otherwise, if node is in neither the formatting
2591
                                category nor the phrasing category, then this is a
2592
                                parse error. Stop this algorithm. The end tag token
2593
                                is ignored. */
2594
                                return false;
2595
                            }
2596
                        }
2597
                    }
2598
                break;
2599
            }
2600
            break;
2601
        }
2602
    }
2603
2604
    private function inTable($token)
2605
    {
2606
        $clear = array('html', 'table');
2607
2608
        /* A character token that is one of U+0009 CHARACTER TABULATION,
2609
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
2610
        or U+0020 SPACE */
2611
        if($token['type'] === HTML5::CHARACTR &&
2612
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
2613
            /* Append the character to the current node. */
2614
            $text = $this->dom->createTextNode($token['data']);
2615
            end($this->stack)->appendChild($text);
2616
2617
        /* A comment token */
2618
        } elseif($token['type'] === HTML5::COMMENT) {
2619
            /* Append a Comment node to the current node with the data
2620
            attribute set to the data given in the comment token. */
2621
            $comment = $this->dom->createComment($token['data']);
2622
            end($this->stack)->appendChild($comment);
2623
2624
        /* A start tag whose tag name is "caption" */
2625
        } elseif($token['type'] === HTML5::STARTTAG &&
2626
        $token['name'] === 'caption') {
2627
            /* Clear the stack back to a table context. */
2628
            $this->clearStackToTableContext($clear);
2629
2630
            /* Insert a marker at the end of the list of active
2631
            formatting elements. */
2632
            $this->a_formatting[] = self::MARKER;
2633
2634
            /* Insert an HTML element for the token, then switch the
2635
            insertion mode to "in caption". */
2636
            $this->insertElement($token);
2637
            $this->mode = self::IN_CAPTION;
2638
2639
        /* A start tag whose tag name is "colgroup" */
2640
        } elseif($token['type'] === HTML5::STARTTAG &&
2641
        $token['name'] === 'colgroup') {
2642
            /* Clear the stack back to a table context. */
2643
            $this->clearStackToTableContext($clear);
2644
2645
            /* Insert an HTML element for the token, then switch the
2646
            insertion mode to "in column group". */
2647
            $this->insertElement($token);
2648
            $this->mode = self::IN_CGROUP;
2649
2650
        /* A start tag whose tag name is "col" */
2651
        } elseif($token['type'] === HTML5::STARTTAG &&
2652
        $token['name'] === 'col') {
2653
            $this->inTable(array(
2654
                'name' => 'colgroup',
2655
                'type' => HTML5::STARTTAG,
2656
                'attr' => array()
2657
            ));
2658
2659
            $this->inColumnGroup($token);
2660
2661
        /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
2662
        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
2663
        array('tbody', 'tfoot', 'thead'))) {
2664
            /* Clear the stack back to a table context. */
2665
            $this->clearStackToTableContext($clear);
2666
2667
            /* Insert an HTML element for the token, then switch the insertion
2668
            mode to "in table body". */
2669
            $this->insertElement($token);
2670
            $this->mode = self::IN_TBODY;
2671
2672
        /* A start tag whose tag name is one of: "td", "th", "tr" */
2673
        } elseif($token['type'] === HTML5::STARTTAG &&
2674
        in_array($token['name'], array('td', 'th', 'tr'))) {
2675
            /* Act as if a start tag token with the tag name "tbody" had been
2676
            seen, then reprocess the current token. */
2677
            $this->inTable(array(
2678
                'name' => 'tbody',
2679
                'type' => HTML5::STARTTAG,
2680
                'attr' => array()
2681
            ));
2682
2683
            return $this->inTableBody($token);
2684
2685
        /* A start tag whose tag name is "table" */
2686
        } elseif($token['type'] === HTML5::STARTTAG &&
2687
        $token['name'] === 'table') {
2688
            /* Parse error. Act as if an end tag token with the tag name "table"
2689
            had been seen, then, if that token wasn't ignored, reprocess the
2690
            current token. */
2691
            $this->inTable(array(
2692
                'name' => 'table',
2693
                'type' => HTML5::ENDTAG
2694
            ));
2695
2696
            return $this->mainPhase($token);
2697
2698
        /* An end tag whose tag name is "table" */
2699
        } elseif($token['type'] === HTML5::ENDTAG &&
2700
        $token['name'] === 'table') {
2701
            /* If the stack of open elements does not have an element in table
2702
            scope with the same tag name as the token, this is a parse error.
2703
            Ignore the token. (innerHTML case) */
2704
            if(!$this->elementInScope($token['name'], true)) {
2705
                return false;
2706
2707
            /* Otherwise: */
2708
            } else {
2709
                /* Generate implied end tags. */
2710
                $this->generateImpliedEndTags();
2711
2712
                /* Now, if the current node is not a table element, then this
2713
                is a parse error. */
2714
                // w/e
2715
2716
                /* Pop elements from this stack until a table element has been
2717
                popped from the stack. */
2718
                while(true) {
2719
                    $current = end($this->stack)->nodeName;
2720
                    array_pop($this->stack);
2721
2722
                    if($current === 'table') {
2723
                        break;
2724
                    }
2725
                }
2726
2727
                /* Reset the insertion mode appropriately. */
2728
                $this->resetInsertionMode();
2729
            }
2730
2731
        /* An end tag whose tag name is one of: "body", "caption", "col",
2732
        "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
2733
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
2734
        array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td',
2735
        'tfoot', 'th', 'thead', 'tr'))) {
2736
            // Parse error. Ignore the token.
2737
2738
        /* Anything else */
2739
        } else {
2740
            /* Parse error. Process the token as if the insertion mode was "in
2741
            body", with the following exception: */
2742
2743
            /* If the current node is a table, tbody, tfoot, thead, or tr
2744
            element, then, whenever a node would be inserted into the current
2745
            node, it must instead be inserted into the foster parent element. */
2746
            if(in_array(end($this->stack)->nodeName,
2747
            array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
2748
                /* The foster parent element is the parent element of the last
2749
                table element in the stack of open elements, if there is a
2750
                table element and it has such a parent element. If there is no
2751
                table element in the stack of open elements (innerHTML case),
2752
                then the foster parent element is the first element in the
2753
                stack of open elements (the html  element). Otherwise, if there
2754
                is a table element in the stack of open elements, but the last
2755
                table element in the stack of open elements has no parent, or
2756
                its parent node is not an element, then the foster parent
2757
                element is the element before the last table element in the
2758
                stack of open elements. */
2759
                for($n = count($this->stack) - 1; $n >= 0; $n--) {
2760
                    if($this->stack[$n]->nodeName === 'table') {
2761
                        $table = $this->stack[$n];
2762
                        break;
2763
                    }
2764
                }
2765
2766
                if(isset($table) && $table->parentNode !== null) {
2767
                    $this->foster_parent = $table->parentNode;
2768
2769
                } elseif(!isset($table)) {
2770
                    $this->foster_parent = $this->stack[0];
2771
2772
                } elseif(isset($table) && ($table->parentNode === null ||
2773
                $table->parentNode->nodeType !== XML_ELEMENT_NODE)) {
2774
                    $this->foster_parent = $this->stack[$n - 1];
2775
                }
2776
            }
2777
2778
            $this->inBody($token);
2779
        }
2780
    }
2781
2782
    /**
     * Processes one token while the insertion mode is "in caption".
     *
     * @param array $token Token array; 'type' is always read and 'name' is
     *                     read for start and end tags.
     * @return mixed The result of inTable() when the token is reprocessed
     *               there, otherwise nothing (null).
     */
    private function inCaption($token)
    {
        /* An end tag whose tag name is "caption" */
        if($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. (innerHTML case) */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore

            /* Otherwise: */
            } else {
                /* Generate implied end tags. */
                $this->generateImpliedEndTags();

                /* Now, if the current node is not a caption element, then this
                is a parse error. */
                // Parse errors are not reported by this implementation.

                /* Pop elements from this stack until a caption element has
                been popped from the stack. */
                while(true) {
                    $node = end($this->stack)->nodeName;
                    array_pop($this->stack);

                    if($node === 'caption') {
                        break;
                    }
                }

                /* Clear the list of active formatting elements up to the last
                marker. */
                $this->clearTheActiveFormattingElementsUpToTheLastMarker();

                /* Switch the insertion mode to "in table". */
                $this->mode = self::IN_TABLE;
            }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
        name is "table" */
        } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
        'thead', 'tr'), true)) || ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'table')) {
            /* Parse error. Act as if an end tag with the tag name "caption"
            had been seen, then, if that token wasn't ignored, reprocess the
            current token. */
            if(!$this->elementInScope('caption', true)) {
                // The implied </caption> would be ignored (same test as the
                // branch above), so the current token is dropped as well.

            } else {
                $this->inCaption(array(
                    'name' => 'caption',
                    'type' => HTML5::ENDTAG
                ));

                // The implied </caption> switched the mode to "in table".
                return $this->inTable($token);
            }

        /* An end tag whose tag name is one of: "body", "col", "colgroup",
        "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th',
        'thead', 'tr'), true)) {
            // Parse error. Ignore the token.

        /* Anything else */
        } else {
            /* Process the token as if the insertion mode was "in body". */
            $this->inBody($token);
        }
    }
2850
2851
    /**
     * Processes one token while the insertion mode is "in column group".
     *
     * @param array $token Token array; 'type' is always read, 'data' is read
     *                     for character and comment tokens, 'name' for tags.
     * @return mixed The result of inTable() when the token is reprocessed
     *               there, otherwise nothing (null).
     */
    private function inColumnGroup($token)
    {
        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        if($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append the character to the current node. */
            $text = $this->dom->createTextNode($token['data']);
            end($this->stack)->appendChild($text);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data
            attribute set to the data given in the comment token. */
            $comment = $this->dom->createComment($token['data']);
            end($this->stack)->appendChild($comment);

        /* A start tag whose tag name is "col" */
        } elseif($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') {
            /* Insert a col element for the token. Immediately pop the current
            node off the stack of open elements. */
            $this->insertElement($token);
            array_pop($this->stack);

        /* An end tag whose tag name is "colgroup" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'colgroup') {
            /* If the current node is the root html element, then this is a
            parse error, ignore the token. (innerHTML case) */
            if(end($this->stack)->nodeName === 'html') {
                // Ignore

            /* Otherwise, pop the current node (which will be a colgroup
            element) from the stack of open elements. Switch the insertion
            mode to "in table". */
            } else {
                array_pop($this->stack);
                $this->mode = self::IN_TABLE;
            }

        /* An end tag whose tag name is "col" */
        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') {
            /* Parse error. Ignore the token. */

        /* Anything else */
        } else {
            /* Act as if an end tag with the tag name "colgroup" had been seen,
            and then, if that token wasn't ignored, reprocess the current token. */
            if(end($this->stack)->nodeName === 'html') {
                // The implied </colgroup> would be ignored (same test as the
                // "colgroup" end tag branch above), so drop this token too.

            } else {
                $this->inColumnGroup(array(
                    'name' => 'colgroup',
                    'type' => HTML5::ENDTAG
                ));

                // The implied </colgroup> switched the mode to "in table".
                return $this->inTable($token);
            }
        }
    }
2908
2909
    /**
     * Processes one token while the insertion mode is "in table body".
     *
     * @param array $token Token array; 'type' is always read and 'name' is
     *                     read for start and end tags.
     * @return mixed The result of inRow() or mainPhase() when the token is
     *               reprocessed, otherwise nothing (null).
     */
    private function inTableBody($token)
    {
        // Node names passed to clearStackToTableContext() for a table body
        // context.
        $clear = array('tbody', 'tfoot', 'thead', 'html');

        /* A start tag whose tag name is "tr" */
        if($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Insert a tr element for the token, then switch the insertion
            mode to "in row". */
            $this->insertElement($token);
            $this->mode = self::IN_ROW;

        /* A start tag whose tag name is one of: "th", "td" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')) {
            /* Parse error. Act as if a start tag with the tag name "tr" had
            been seen, then reprocess the current token. */
            $this->inTableBody(array(
                'name' => 'tr',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            ));

            return $this->inRow($token);

        /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore

            /* Otherwise: */
            } else {
                /* Clear the stack back to a table body context. */
                $this->clearStackToTableContext($clear);

                /* Pop the current node from the stack of open elements. Switch
                the insertion mode to "in table". */
                array_pop($this->stack);
                $this->mode = self::IN_TABLE;
            }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
        } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'), true)) ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
            /* If the stack of open elements does not have a tbody, thead, or
            tfoot element in table scope, this is a parse error. Ignore the
            token. (innerHTML case) */
            if(!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
                // Ignore.

            /* Otherwise: */
            } else {
                /* Clear the stack back to a table body context. */
                $this->clearStackToTableContext($clear);

                /* Act as if an end tag with the same tag name as the current
                node ("tbody", "tfoot", or "thead") had been seen, then
                reprocess the current token. */
                $this->inTableBody(array(
                    'name' => end($this->stack)->nodeName,
                    'type' => HTML5::ENDTAG
                ));

                return $this->mainPhase($token);
            }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html", "td", "th", "tr" */
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'),
        true)) {
            /* Parse error. Ignore the token. */

        /* Anything else */
        } else {
            /* Process the token as if the insertion mode was "in table". */
            $this->inTable($token);
        }
    }
2995
2996
    /**
     * Processes one token while the insertion mode is "in row".
     *
     * @param array $token Token array; 'type' is always read and 'name' is
     *                     read for start and end tags.
     * @return mixed The result of inTableBody() when the token is reprocessed
     *               there, otherwise nothing (null).
     */
    private function inRow($token)
    {
        // Node names passed to clearStackToTableContext() for a table row
        // context.
        $clear = array('tr', 'html');

        /* A start tag whose tag name is one of: "th", "td" */
        if($token['type'] === HTML5::STARTTAG &&
        ($token['name'] === 'th' || $token['name'] === 'td')) {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Insert an HTML element for the token, then switch the insertion
            mode to "in cell". */
            $this->insertElement($token);
            $this->mode = self::IN_CELL;

            /* Insert a marker at the end of the list of active formatting
            elements. */
            $this->a_formatting[] = self::MARKER;

        /* An end tag whose tag name is "tr" */
        } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. (innerHTML case) */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore.

            /* Otherwise: */
            } else {
                /* Clear the stack back to a table row context. */
                $this->clearStackToTableContext($clear);

                /* Pop the current node (which will be a tr element) from the
                stack of open elements. Switch the insertion mode to "in table
                body". */
                array_pop($this->stack);
                $this->mode = self::IN_TBODY;
            }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
        } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'),
        true)) || ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'table')) {
            /* Act as if an end tag with the tag name "tr" had been seen, then,
            if that token wasn't ignored, reprocess the current token. */
            if(!$this->elementInScope('tr', true)) {
                // The implied </tr> would be ignored (same test as the "tr"
                // end tag branch above), so the current token is dropped too.

            } else {
                $this->inRow(array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                ));

                // The implied </tr> switched the mode to "in table body", so
                // the token is reprocessed there rather than in "in cell".
                return $this->inTableBody($token);
            }

        /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore.

            /* Otherwise: */
            } else {
                /* Otherwise, act as if an end tag with the tag name "tr" had
                been seen, then reprocess the current token. */
                $this->inRow(array(
                    'name' => 'tr',
                    'type' => HTML5::ENDTAG
                ));

                // Handing the token to inTableBody() (instead of inCell())
                // avoids bouncing the token back into this branch.
                return $this->inTableBody($token);
            }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html", "td", "th" */
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'), true)) {
            /* Parse error. Ignore the token. */

        /* Anything else */
        } else {
            /* Process the token as if the insertion mode was "in table". */
            $this->inTable($token);
        }
    }
3081
3082
    /**
     * Processes one token while the insertion mode is "in cell".
     *
     * @param array $token Token array; 'type' is always read and 'name' is
     *                     read for start and end tags.
     * @return mixed The result of inRow() when the token is reprocessed
     *               there, otherwise nothing (null).
     */
    private function inCell($token)
    {
        /* An end tag whose tag name is one of: "td", "th" */
        if($token['type'] === HTML5::ENDTAG &&
        ($token['name'] === 'td' || $token['name'] === 'th')) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as that of the token, then this is a
            parse error and the token must be ignored. */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore.

            /* Otherwise: */
            } else {
                /* Generate implied end tags, except for elements with the same
                tag name as the token. */
                $this->generateImpliedEndTags(array($token['name']));

                /* Now, if the current node is not an element with the same tag
                name as the token, then this is a parse error. */
                // Parse errors are not reported by this implementation.

                /* Pop elements from this stack until an element with the same
                tag name as the token has been popped from the stack. */
                while(true) {
                    $node = end($this->stack)->nodeName;
                    array_pop($this->stack);

                    if($node === $token['name']) {
                        break;
                    }
                }

                /* Clear the list of active formatting elements up to the last
                marker. */
                $this->clearTheActiveFormattingElementsUpToTheLastMarker();

                /* Switch the insertion mode to "in row". (The current node
                will be a tr element at this point.) */
                $this->mode = self::IN_ROW;
            }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "td", "tfoot", "th", "thead", "tr" */
        } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
        'thead', 'tr'), true)) {
            /* If the stack of open elements does not have a td or th element
            in table scope, then this is a parse error; ignore the token.
            (innerHTML case) */
            if(!$this->elementInScope(array('td', 'th'), true)) {
                // Ignore.

            /* Otherwise, close the cell (see below) and reprocess the current
            token. */
            } else {
                $this->closeCell();
                return $this->inRow($token);
            }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html" */
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('body', 'caption', 'col', 'colgroup', 'html'), true)) {
            /* Parse error. Ignore the token. */

        /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
        "thead", "tr" */
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('table', 'tbody', 'tfoot', 'thead', 'tr'), true)) {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as that of the token (which can only
            happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
            then this is a parse error and the token must be ignored. */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore.

            /* Otherwise, close the cell (see below) and reprocess the current
            token. */
            } else {
                $this->closeCell();
                return $this->inRow($token);
            }

        /* Anything else */
        } else {
            /* Process the token as if the insertion mode was "in body". */
            $this->inBody($token);
        }
    }
3189
3190
    /**
     * Processes one token while the insertion mode is "in select".
     *
     * @param array $token Token array; 'type' is always read, 'data' is read
     *                     for character and comment tokens, 'name' for tags.
     * @return void
     */
    private function inSelect($token)
    {
        /* Handle the token as follows: */

        /* A character token */
        if($token['type'] === HTML5::CHARACTR) {
            /* Append the token's character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data
            attribute set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* A start tag token whose tag name is "option" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'option') {
            /* If the current node is an option element, act as if an end tag
            with the tag name "option" had been seen. */
            if(end($this->stack)->nodeName === 'option') {
                $this->inSelect(array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                ));
            }

            /* Insert an HTML element for the token. */
            $this->insertElement($token);

        /* A start tag token whose tag name is "optgroup" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'optgroup') {
            /* If the current node is an option element, act as if an end tag
            with the tag name "option" had been seen. */
            if(end($this->stack)->nodeName === 'option') {
                $this->inSelect(array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                ));
            }

            /* If the current node is an optgroup element, act as if an end tag
            with the tag name "optgroup" had been seen. */
            if(end($this->stack)->nodeName === 'optgroup') {
                $this->inSelect(array(
                    'name' => 'optgroup',
                    'type' => HTML5::ENDTAG
                ));
            }

            /* Insert an HTML element for the token. */
            $this->insertElement($token);

        /* An end tag token whose tag name is "optgroup" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'optgroup') {
            /* First, if the current node is an option element, and the node
            immediately before it in the stack of open elements is an optgroup
            element, then act as if an end tag with the tag name "option" had
            been seen. */
            $elements_in_stack = count($this->stack);

            if($elements_in_stack >= 2 &&
            $this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
            $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup') {
                $this->inSelect(array(
                    'name' => 'option',
                    'type' => HTML5::ENDTAG
                ));
            }

            /* If the current node is an optgroup element, then pop that node
            from the stack of open elements. Otherwise, this is a parse error,
            ignore the token. */
            // Look at the current node afresh: the implied </option> above may
            // have popped the stack, and the node's name (not the node object
            // itself) is what has to be compared with 'optgroup'.
            if(end($this->stack)->nodeName === 'optgroup') {
                array_pop($this->stack);
            }

        /* An end tag token whose tag name is "option" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'option') {
            /* If the current node is an option element, then pop that node
            from the stack of open elements. Otherwise, this is a parse error,
            ignore the token. */
            if(end($this->stack)->nodeName === 'option') {
                array_pop($this->stack);
            }

        /* An end tag whose tag name is "select" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'select') {
            /* If the stack of open elements does not have an element in table
            scope with the same tag name as the token, this is a parse error.
            Ignore the token. (innerHTML case) */
            if(!$this->elementInScope($token['name'], true)) {
                // Ignore.

            /* Otherwise: */
            } else {
                /* Pop elements from the stack of open elements until a select
                element has been popped from the stack. */
                while(true) {
                    $current = end($this->stack)->nodeName;
                    array_pop($this->stack);

                    if($current === 'select') {
                        break;
                    }
                }

                /* Reset the insertion mode appropriately. */
                $this->resetInsertionMode();
            }

        /* A start tag whose tag name is "select" */
        // The type is tested before 'name' so that 'name' is only read once
        // the token is known to be a tag.
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'select') {
            /* Parse error. Act as if the token had been an end tag with the
            tag name "select" instead. */
            $this->inSelect(array(
                'name' => 'select',
                'type' => HTML5::ENDTAG
            ));

        /* An end tag whose tag name is one of: "caption", "table", "tbody",
        "tfoot", "thead", "tr", "td", "th" */
        } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
        array('caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'),
        true)) {
            /* Parse error. */
            // Parse errors are not reported by this implementation.

            /* If the stack of open elements has an element in table scope with
            the same tag name as that of the token, then act as if an end tag
            with the tag name "select" had been seen, and reprocess the token.
            Otherwise, ignore the token. */
            if($this->elementInScope($token['name'], true)) {
                $this->inSelect(array(
                    'name' => 'select',
                    'type' => HTML5::ENDTAG
                ));

                $this->mainPhase($token);
            }

        /* Anything else */
        } else {
            /* Parse error. Ignore the token. */
        }
    }
3339
3340
    /**
     * Processes one token while the insertion mode is "after body".
     *
     * @param array $token Token array; 'type' is always read, 'data' is read
     *                     for character and comment tokens, 'name' for end tags.
     * @return mixed The result of inBody() for the fall-through case,
     *               otherwise nothing (null).
     */
    private function afterBody($token)
    {
        $kind = $token['type'];

        /* Whitespace-only character data (tab, LF, line tabulation, form feed
        or space) is processed as it would be in "in body"; the insertion mode
        itself is left untouched. */
        if($kind === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            $this->inBody($token);
            return;
        }

        /* A comment is appended to the first element in the stack of open
        elements (the html element). */
        if($kind === HTML5::COMMENT) {
            $node = $this->dom->createComment($token['data']);
            $this->stack[0]->appendChild($node);
            return;
        }

        /* An end tag named "html" switches to the trailing end phase. The
        innerHTML case, where the token would be ignored instead, is not
        distinguished here. */
        if($kind === HTML5::ENDTAG && $token['name'] === 'html') {
            $this->phase = self::END_PHASE;
            return;
        }

        /* Anything else is a parse error: go back to "in body" and reprocess
        the token there. */
        $this->mode = self::IN_BODY;
        return $this->inBody($token);
    }
3379
3380
    /**
     * Processes one token while the insertion mode is "in frameset".
     *
     * @param array $token Token array; 'type' is always read, 'data' is read
     *                     for character and comment tokens, 'name' for tags.
     * @return void
     */
    private function inFrameset($token)
    {
        /* Handle the token as follows: */

        /* A character token that is one of one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
        // Note: the pattern below does not include CR.
        if($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($token['type'] === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data
            attribute set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* A start tag with the tag name "frameset" */
        // In every tag branch the type is tested before 'name', so that
        // 'name' is only read once the token is known to be a tag.
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'frameset') {
            $this->insertElement($token);

        /* An end tag with the tag name "frameset" */
        } elseif($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'frameset') {
            /* If the current node is the root html element, then this is a
            parse error; ignore the token. (innerHTML case) */
            if(end($this->stack)->nodeName === 'html') {
                // Ignore

            } else {
                /* Otherwise, pop the current node from the stack of open
                elements. */
                array_pop($this->stack);

                /* If the parser was not originally created in order to handle
                the setting of an element's innerHTML attribute (innerHTML case),
                and the current node is no longer a frameset element, then change
                the insertion mode to "after frameset". */
                // With nested framesets the new current node is still a
                // frameset, and the mode must then stay "in frameset".
                $current = end($this->stack);

                if($current === false || $current->nodeName !== 'frameset') {
                    $this->mode = self::AFTR_FRAME;
                }
            }

        /* A start tag with the tag name "frame" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'frame') {
            /* Insert an HTML element for the token. */
            $this->insertElement($token);

            /* Immediately pop the current node off the stack of open elements. */
            array_pop($this->stack);

        /* A start tag with the tag name "noframes" */
        } elseif($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'noframes') {
            /* Process the token as if the insertion mode had been "in body". */
            $this->inBody($token);

        /* Anything else */
        } else {
            /* Parse error. Ignore the token. */
        }
    }
3443
3444
    /**
     * Handles one token while the insertion mode is "after frameset".
     *
     * Whitespace character data and comments are appended to the current
     * node, an "html" end tag switches the parser to the trailing end phase,
     * and a "noframes" start tag is delegated to inBody(). Every other token
     * is a parse error and is ignored.
     *
     * @param array $token Token array; 'type' is always read, 'name' and
     *                     'data' are read only for the token types that
     *                     need them.
     */
    private function afterFrameset($token)
    {
        $type = $token['type'];

        /* Read the name defensively: evaluating $token['name'] before the
        type check raises an undefined-index notice for tokens that carry
        no name. */
        $name = isset($token['name']) ? $token['name'] : null;

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE.
        NOTE(review): U+000D CARRIAGE RETURN is not part of the pattern;
        presumably CR is normalised before tokens reach this point - confirm. */
        if($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append the character to the current node. */
            $this->insertText($token['data']);

        /* A comment token */
        } elseif($type === HTML5::COMMENT) {
            /* Append a Comment node to the current node with the data
            attribute set to the data given in the comment token. */
            $this->insertComment($token['data']);

        /* An end tag with the tag name "html" */
        } elseif($type === HTML5::ENDTAG && $name === 'html') {
            /* Switch to the trailing end phase. */
            $this->phase = self::END_PHASE;

        /* A start tag with the tag name "noframes" */
        } elseif($type === HTML5::STARTTAG && $name === 'noframes') {
            /* Process the token as if the insertion mode had been "in body". */
            $this->inBody($token);

        /* Anything else */
        } else {
            /* Parse error. Ignore the token. */
        }
    }
3479
3480
    /**
     * Handles one token emitted after the main phase has finished.
     *
     * DOCTYPE tokens are ignored, comments are appended to the document
     * itself, and whitespace character data is forwarded to mainPhase().
     * Any other character data, start tag or end tag is a parse error that
     * switches the parser back to the main phase and reprocesses the token
     * there.
     *
     * @param array $token Token array; 'type' is always read, 'data' is read
     *                     for comment and character tokens.
     *
     * @return mixed The result of mainPhase() when the token forces a return
     *               to the main phase; no explicit value otherwise.
     */
    private function trailingEndPhase($token)
    {
        /* After the main phase, as each token is emitted from the tokenisation
        stage, it must be processed as described in this section. */
        $type = $token['type'];

        /* A DOCTYPE token */
        if($type === HTML5::DOCTYPE) {
            // Parse error. Ignore the token.

        /* A comment token */
        } elseif($type === HTML5::COMMENT) {
            /* Append a Comment node to the Document object with the data
            attribute set to the data given in the comment token. */
            $comment = $this->dom->createComment($token['data']);
            $this->dom->appendChild($comment);

        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        } elseif($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Process the token as it would be processed in the main phase. */
            $this->mainPhase($token);

        /* A character token that is not whitespace-only, or a start tag
        token, or an end tag token. Whitespace-only character data was taken
        by the branch above, so any character token reaching this point
        contains other characters. The previous condition repeated the
        whitespace test without negating it and therefore never matched for
        character tokens. */
        } elseif($type === HTML5::CHARACTR ||
        $type === HTML5::STARTTAG || $type === HTML5::ENDTAG) {
            /* Parse error. Switch back to the main phase and reprocess the
            token. */
            $this->phase = self::MAIN_PHASE;
            return $this->mainPhase($token);

        /* An end-of-file token */
        } elseif($type === HTML5::EOF) {
            /* Nothing left to do. */
        }
    }
3520
3521
    /**
     * Creates an element for a tag token, attaches it to the tree and pushes
     * it onto the stack of open elements.
     *
     * @param array $token  Tag token; 'name' supplies the element name and
     *                      'attr' a list of arrays with 'name' and 'value'.
     * @param bool  $append Accepted for callers, but not consulted by this
     *                      method body.
     *
     * @return DOMElement The element that was created.
     */
    private function insertElement($token, $append = true)
    {
        $element = $this->dom->createElement($token['name']);

        /* When an attribute name occurs more than once, only the first
        occurrence is kept. */
        foreach($token['attr'] as $attribute) {
            $attributeName = $attribute['name'];

            if($element->hasAttribute($attributeName)) {
                continue;
            }

            $element->setAttribute($attributeName, $attribute['value']);
        }

        $this->appendToRealParent($element);

        /* The new element becomes the current node. */
        $this->stack[] = $element;

        return $element;
    }
3536
3537
    /**
     * Creates a text node from the given character data and attaches it via
     * appendToRealParent().
     *
     * @param string $data Character data for the new text node.
     */
    private function insertText($data)
    {
        $this->appendToRealParent($this->dom->createTextNode($data));
    }
3542
3543
    /**
     * Creates a comment node from the given data and attaches it via
     * appendToRealParent().
     *
     * @param string $data Text of the comment.
     */
    private function insertComment($data)
    {
        $this->appendToRealParent($this->dom->createComment($data));
    }
3548
3549
    /**
     * Attaches a node either to the current node or, when a foster parent is
     * pending, to that foster parent.
     *
     * With no foster parent set, the node is appended to the last element of
     * the stack of open elements. Otherwise the node is placed in the foster
     * parent and the foster parent is cleared afterwards.
     *
     * @param DOMNode $node Node to attach.
     */
    private function appendToRealParent($node)
    {
        /* Common case: no foster parenting requested. */
        if($this->foster_parent === null) {
            end($this->stack)->appendChild($node);
            return;
        }

        /* Find the last table element in the stack of open elements that is
        attached to a parent. */
        $lastTable = null;
        $index = count($this->stack);

        while(--$index >= 0) {
            $candidate = $this->stack[$index];

            if($candidate->nodeName === 'table' &&
            $candidate->parentNode !== null) {
                $lastTable = $candidate;
                break;
            }
        }

        /* If the foster parent is the parent of that table, the node goes
        immediately before the table; otherwise it is appended to the foster
        parent. */
        $fosterParent = $this->foster_parent;

        if($lastTable !== null &&
        $fosterParent->isSameNode($lastTable->parentNode)) {
            $fosterParent->insertBefore($node, $lastTable);
        } else {
            $fosterParent->appendChild($node);
        }

        /* Foster parenting applies to a single insertion only. */
        $this->foster_parent = null;
    }
3577
3578
    /**
     * Tells whether the stack of open elements has the given element in
     * scope, or in table scope.
     *
     * The stack is walked from the current node upwards. The search succeeds
     * on the first node whose tag name equals $el and fails on a table
     * element, on the root element, or - for the plain "in scope" variant
     * only - on a caption, td, th, button, marquee or object element.
     *
     * @param string|array $el    Tag name to look for, or a list of tag
     *                            names of which any one may match.
     * @param bool         $table Pass true for the "has an element in table
     *                            scope" variant; the default is the plain
     *                            "has an element in scope" variant.
     *
     * @return bool True when a matching element is in scope, false otherwise.
     */
    private function elementInScope($el, $table = false)
    {
        if(is_array($el)) {
            foreach($el as $element) {
                if($this->elementInScope($element, $table)) {
                    return true;
                }
            }

            return false;
        }

        /* Elements that end the search in the plain "in scope" variant. */
        $scope_boundaries = array('caption', 'td', 'th', 'button', 'marquee',
        'object');

        /* 1. Initialise node to be the current node (the bottommost node of
        the stack). */
        for($n = count($this->stack) - 1; $n >= 0; $n--) {
            $node = $this->stack[$n];

            if($node->tagName === $el) {
                /* 2. If node is the target node, terminate in a match state. */
                return true;
            }

            if($node->tagName === 'table') {
                /* 3. Otherwise, if node is a table element, terminate in a
                failure state. */
                return false;
            }

            /* 4. Otherwise, if the algorithm is the "has an element in scope"
            variant (rather than the "has an element in table scope" variant),
            and node is one of the boundary elements, terminate in a failure
            state. The check used to run for the table variant instead, which
            is the opposite of what this step describes. */
            if($table !== true &&
            in_array($node->tagName, $scope_boundaries, true)) {
                return false;
            }

            if($node === $node->ownerDocument->documentElement) {
                /* 5. Otherwise, if node is an html element (root element),
                terminate in a failure state. */
                return false;
            }

            /* Otherwise, continue with the previous entry in the stack of
            open elements. */
        }

        /* The stack was exhausted (or empty) without reaching a decision:
        report "not in scope" explicitly rather than returning null. */
        return false;
    }
3627
3628
    /**
     * Re-opens the formatting elements at the end of the list of active
     * formatting elements that are no longer in the stack of open elements.
     *
     * Each such entry is shallow-cloned; the clone is appended to the
     * current node, pushed onto the stack of open elements and stored in the
     * list in place of the original entry.
     *
     * The list is accessed by integer index from 0 to count - 1.
     *
     * @return false|null False when there was nothing to reconstruct; no
     *                    explicit value otherwise.
     */
    private function reconstructActiveFormattingElements()
    {
        /* 1. If there are no entries in the list of active formatting elements,
        then there is nothing to reconstruct; stop this algorithm. */
        $formatting_elements = count($this->a_formatting);

        if($formatting_elements === 0) {
            return false;
        }

        /* 2. If the last (most recently added) entry in the list of active
        formatting elements is a marker, or if it is an element that is in the
        stack of open elements, then there is nothing to reconstruct; stop this
        algorithm. */
        $entry = end($this->a_formatting);

        if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
            return false;
        }

        /* 3-6. Walk backwards from the last entry while the preceding entry
        is neither a marker nor an element in the stack of open elements.
        $first ends up as the index of the earliest entry that has to be
        reconstructed. The boundary entry itself (marker or open element) is
        never selected; selecting it used to clone an already-open element or
        call cloneNode() on the marker value. */
        $first = $formatting_elements - 1;

        while($first > 0) {
            $previous = $this->a_formatting[$first - 1];

            if($previous === self::MARKER ||
            in_array($previous, $this->stack, true)) {
                break;
            }

            $first--;
        }

        /* 7-11. Reconstruct every entry from $first up to the last one. */
        for($a = $first; $a < $formatting_elements; $a++) {
            /* 8. Perform a shallow clone of the element entry to obtain clone. */
            $clone = $this->a_formatting[$a]->cloneNode();

            /* 9. Append clone to the current node and push it onto the stack
            of open elements so that it is the new current node. */
            end($this->stack)->appendChild($clone);
            $this->stack[] = $clone;

            /* 10. Replace the entry for entry in the list with an entry for
            clone. */
            $this->a_formatting[$a] = $clone;
        }
    }
3699
3700
    /**
     * Removes entries from the end of the list of active formatting elements
     * up to and including the most recently added marker.
     *
     * If the list contains no marker, it is emptied completely.
     */
    private function clearTheActiveFormattingElementsUpToTheLastMarker()
    {
        /* The emptiness check keeps the loop from spinning forever when the
        list holds no marker: popping an empty array yields null, and with
        the old unconditional loop nothing would then end it (unless MARKER
        itself is false or null). */
        while(count($this->a_formatting) > 0) {
            /* 1-2. Let entry be the last (most recently added) entry in the
            list of active formatting elements and remove it from the list. */
            $entry = array_pop($this->a_formatting);

            /* 3. If entry was a marker, then stop the algorithm at this point.
            The list has been cleared up to the last marker. */
            if($entry === self::MARKER) {
                break;
            }
        }
    }
3721
3722
    /**
     * Pops elements with implied end tags off the stack of open elements.
     *
     * While the current node is a dd, dt, li, p, td, th or tr element that
     * is not listed in $exclude, it is removed from the stack.
     *
     * @param array $exclude Tag names that must not be closed implicitly.
     */
    private function generateImpliedEndTags(array $exclude = array())
    {
        $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

        /* end() yields false once the stack is empty; test for that before
        reading nodeName. */
        $current = end($this->stack);

        while($current !== false && in_array($current->nodeName, $elements, true)) {
            array_pop($this->stack);
            $current = end($this->stack);
        }
    }
3736
3737
    /**
     * Classifies a tag name as special, scoping, formatting or phrasing.
     *
     * The name is looked up in the special, scoping and formatting lists in
     * that order; a name found in none of them is phrasing.
     *
     * @param string $name Tag name to classify.
     *
     * @return mixed One of self::SPECIAL, self::SCOPING, self::FORMATTING or
     *               self::PHRASING.
     */
    private function getElementCategory($name)
    {
        if(in_array($name, $this->special)) {
            return self::SPECIAL;
        }

        if(in_array($name, $this->scoping)) {
            return self::SCOPING;
        }

        if(in_array($name, $this->formatting)) {
            return self::FORMATTING;
        }

        return self::PHRASING;
    }
3751
3752
    /**
     * Pops elements off the stack of open elements until the current node is
     * one of the given elements.
     *
     * If none of the given elements is on the stack, the stack ends up empty.
     *
     * @param array $elements Tag names at which popping stops.
     */
    private function clearStackToTableContext($elements)
    {
        /* When the steps above require the UA to clear the stack back to a
        table context, it means that the UA must, while the current node is not
        one of the context elements, pop elements from the stack of open
        elements. If this causes any elements to be popped from the stack, then
        this is a parse error. */

        /* end() yields false once the stack is empty; stopping there keeps
        the loop from running forever when no context element is open. */
        while(($current = end($this->stack)) !== false) {
            if(in_array($current->nodeName, $elements, true)) {
                break;
            }

            array_pop($this->stack);
        }
    }
3769
3770
    /**
     * Chooses the insertion mode that fits the current stack of open
     * elements.
     *
     * The stack is walked from the current node towards the first node. The
     * first node whose name has an associated insertion mode decides the
     * result; if the first node of the stack is reached without a decision,
     * the mode becomes "in body".
     */
    private function resetInsertionMode()
    {
        /* Steps 4-13: element names that map directly to an insertion mode.
        A head element deliberately selects "in body", not "in head". */
        $mode_for_element = array(
            'select'   => self::IN_SELECT,
            'td'       => self::IN_CELL,
            'th'       => self::IN_CELL,
            'tr'       => self::IN_ROW,
            'tbody'    => self::IN_TBODY,
            'thead'    => self::IN_TBODY,
            'tfoot'    => self::IN_TBODY,
            'caption'  => self::IN_CAPTION,
            'colgroup' => self::IN_CGROUP,
            'table'    => self::IN_TABLE,
            'head'     => self::IN_BODY,
            'body'     => self::IN_BODY,
            'frameset' => self::IN_FRAME,
        );

        /* 1. Let last be false. */
        $last = false;

        /* 2. Start with the last node in the stack of open elements. */
        $position = count($this->stack);

        while(--$position >= 0) {
            $node = $this->stack[$position];

            /* 3. If node is the first node in the stack of open elements,
            then set last to true. */
            if($this->stack[0]->isSameNode($node)) {
                $last = true;
            }

            $name = $node->nodeName;

            if(array_key_exists($name, $mode_for_element)) {
                $this->mode = $mode_for_element[$name];
                return;
            }

            /* 14. If node is an html element, then: if the head element
            pointer is null, switch the insertion mode to "before head",
            otherwise, switch the insertion mode to "after head". */
            if($name === 'html') {
                if($this->head_pointer === null) {
                    $this->mode = self::BEFOR_HEAD;
                } else {
                    $this->mode = self::AFTER_HEAD;
                }

                return;
            }

            /* 15. If last is true, then set the insertion mode to "in body"
            and abort these steps. */
            if($last) {
                $this->mode = self::IN_BODY;
                return;
            }
        }
    }
3868
3869
    /**
     * Closes the currently open table cell, if there is one.
     *
     * If a td element - or, failing that, a th element - is reported by
     * elementInScope() with the table flag set, an end tag token with that
     * name is passed to inCell(). At most one end tag is generated.
     */
    private function closeCell()
    {
        foreach(array('td', 'th') as $cell) {
            if(!$this->elementInScope($cell, true)) {
                continue;
            }

            /* Act as if an end tag token with that tag name had been seen. */
            $end_tag = array(
                'name' => $cell,
                'type' => HTML5::ENDTAG
            );

            $this->inCell($end_tag);

            return;
        }
    }
3884
3885
    /**
     * Returns the document held in the dom property.
     *
     * @return mixed The value of $this->dom.
     */
    public function save()
    {
        $document = $this->dom;

        return $document;
    }
3889
}
3890