1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/* |
4
|
|
|
|
5
|
|
|
Copyright 2007 Jeroen van der Meer <http://jero.net/> |
6
|
|
|
Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/> |
7
|
|
|
Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/> |
8
|
|
|
|
9
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a |
10
|
|
|
copy of this software and associated documentation files (the |
11
|
|
|
"Software"), to deal in the Software without restriction, including |
12
|
|
|
without limitation the rights to use, copy, modify, merge, publish, |
13
|
|
|
distribute, sublicense, and/or sell copies of the Software, and to |
14
|
|
|
permit persons to whom the Software is furnished to do so, subject to |
15
|
|
|
the following conditions: |
16
|
|
|
|
17
|
|
|
The above copyright notice and this permission notice shall be included |
18
|
|
|
in all copies or substantial portions of the Software. |
19
|
|
|
|
20
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
21
|
|
|
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
22
|
|
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
23
|
|
|
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
24
|
|
|
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
25
|
|
|
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
26
|
|
|
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
27
|
|
|
|
28
|
|
|
*/ |
29
|
|
|
|
30
|
|
|
// Some conventions: |
31
|
|
|
// /* */ indicates verbatim text from the HTML 5 specification |
32
|
|
|
// // indicates regular comments |
33
|
|
|
|
34
|
|
|
// all flags are in hyphenated form |
35
|
|
|
|
36
|
|
|
class HTML5_Tokenizer { |
37
|
|
|
/** |
38
|
|
|
* Points to an InputStream object. |
39
|
|
|
*/ |
40
|
|
|
protected $stream; |
41
|
|
|
|
42
|
|
|
/** |
43
|
|
|
* Tree builder that the tokenizer emits tokens to. |
44
|
|
|
*/ |
45
|
|
|
private $tree; |
46
|
|
|
|
47
|
|
|
/** |
48
|
|
|
* Current content model we are parsing as. |
49
|
|
|
*/ |
50
|
|
|
protected $content_model; |
51
|
|
|
|
52
|
|
|
/** |
53
|
|
|
* Current token that is being built, but not yet emitted. Also |
54
|
|
|
* is the last token emitted, if applicable. |
55
|
|
|
*/ |
56
|
|
|
protected $token; |
57
|
|
|
|
58
|
|
|
// These are constants describing the content model |
59
|
|
|
const PCDATA = 0; |
60
|
|
|
const RCDATA = 1; |
61
|
|
|
const CDATA = 2; |
62
|
|
|
const PLAINTEXT = 3; |
63
|
|
|
|
64
|
|
|
// These are constants describing tokens |
65
|
|
|
// XXX should probably be moved somewhere else, probably the |
66
|
|
|
// HTML5 class. |
67
|
|
|
const DOCTYPE = 0; |
68
|
|
|
const STARTTAG = 1; |
69
|
|
|
const ENDTAG = 2; |
70
|
|
|
const COMMENT = 3; |
71
|
|
|
const CHARACTER = 4; |
72
|
|
|
const SPACECHARACTER = 5; |
73
|
|
|
const EOF = 6; |
74
|
|
|
const PARSEERROR = 7; |
75
|
|
|
|
76
|
|
|
// These are constants representing bunches of characters. |
77
|
|
|
const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'; |
78
|
|
|
const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; |
79
|
|
|
const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz'; |
80
|
|
|
const DIGIT = '0123456789'; |
81
|
|
|
const HEX = '0123456789ABCDEFabcdef'; |
82
|
|
|
const WHITESPACE = "\t\n\x0c "; |
83
|
|
|
|
84
|
|
|
/**
 * Sets up a tokenizer over the given input.
 *
 * The raw data is wrapped in an HTML5_InputStream, the tree builder that
 * will receive the tokens is stored, and the content model starts out
 * as PCDATA.
 *
 * @param $data    Data to parse; handed unchanged to HTML5_InputStream.
 * @param $builder Optional tree builder to emit tokens to. When omitted
 *                 (or otherwise falsy) a fresh HTML5_TreeBuilder is
 *                 created and used instead.
 */
public function __construct($data, $builder = null) {
    // The stream is built first, before any tree builder is instantiated.
    $this->stream = new HTML5_InputStream($data);

    // Use the caller's builder when one was supplied, otherwise our own.
    $this->tree = $builder ? $builder : new HTML5_TreeBuilder();

    // Tokenization always begins in the PCDATA content model.
    $this->content_model = self::PCDATA;
}
93
|
|
|
|
94
|
|
|
/**
 * Parses the input as a fragment within the given context.
 *
 * The tree builder is first asked to set itself up for $context. If it
 * then reports a (truthy) content model, the tokenizer adopts it and the
 * value on the tree builder is reset to null. Finally the regular
 * parse() loop is run.
 *
 * Note: the check is a truthiness test, so a content model of
 * self::PCDATA (0) is treated the same as "nothing requested".
 *
 * @param $context Context passed straight through to the tree builder's
 *                 setupContext(); defaults to null.
 */
public function parseFragment($context = null) {
    $builder = $this->tree;
    $builder->setupContext($context);

    // Take over whatever content model the tree builder asked for, and
    // clear the request so it is only applied once.
    $requested = $builder->content_model;
    if ($requested) {
        $this->content_model = $requested;
        $builder->content_model = null;
    }

    $this->parse();
}
102
|
|
|
|
103
|
|
|
// XXX maybe convert this into an iterator? regardless, this function |
104
|
|
|
// and the save function should go into a Parser facade of some sort |
105
|
|
|
/** |
106
|
|
|
* Performs the actual parsing of the document. |
107
|
|
|
*/ |
108
|
|
|
public function parse() { |
109
|
|
|
// Current state |
110
|
|
|
$state = 'data'; |
111
|
|
|
// This is used to avoid having to have look-behind in the data state. |
112
|
|
|
$lastFourChars = ''; |
113
|
|
|
/** |
114
|
|
|
* Escape flag as specified by the HTML5 specification: "used to |
115
|
|
|
* control the behavior of the tokeniser. It is either true or |
116
|
|
|
* false, and initially must be set to the false state." |
117
|
|
|
*/ |
118
|
|
|
$escape = false; |
119
|
|
|
//echo "\n\n"; |
120
|
|
|
while($state !== null) { |
121
|
|
|
|
122
|
|
|
/*echo $state . ' '; |
123
|
|
|
switch ($this->content_model) { |
124
|
|
|
case self::PCDATA: echo 'PCDATA'; break; |
125
|
|
|
case self::RCDATA: echo 'RCDATA'; break; |
126
|
|
|
case self::CDATA: echo 'CDATA'; break; |
127
|
|
|
case self::PLAINTEXT: echo 'PLAINTEXT'; break; |
128
|
|
|
} |
129
|
|
|
if ($escape) echo " escape"; |
130
|
|
|
echo "\n";*/ |
131
|
|
|
|
132
|
|
|
switch($state) { |
133
|
|
|
case 'data': |
134
|
|
|
|
135
|
|
|
/* Consume the next input character */ |
136
|
|
|
$char = $this->stream->char(); |
137
|
|
|
$lastFourChars .= $char; |
138
|
|
|
if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4); |
139
|
|
|
|
140
|
|
|
// see below for meaning |
141
|
|
|
$hyp_cond = |
142
|
|
|
!$escape && |
143
|
|
|
( |
144
|
|
|
$this->content_model === self::RCDATA || |
145
|
|
|
$this->content_model === self::CDATA |
146
|
|
|
); |
147
|
|
|
$amp_cond = |
148
|
|
|
!$escape && |
149
|
|
|
( |
150
|
|
|
$this->content_model === self::PCDATA || |
151
|
|
|
$this->content_model === self::RCDATA |
152
|
|
|
); |
153
|
|
|
$lt_cond = |
154
|
|
|
$this->content_model === self::PCDATA || |
155
|
|
|
( |
156
|
|
|
( |
157
|
|
|
$this->content_model === self::RCDATA || |
158
|
|
|
$this->content_model === self::CDATA |
159
|
|
|
) && |
160
|
|
|
!$escape |
161
|
|
|
); |
162
|
|
|
$gt_cond = |
163
|
|
|
$escape && |
164
|
|
|
( |
165
|
|
|
$this->content_model === self::RCDATA || |
166
|
|
|
$this->content_model === self::CDATA |
167
|
|
|
); |
168
|
|
|
|
169
|
|
|
if($char === '&' && $amp_cond) { |
170
|
|
|
/* U+0026 AMPERSAND (&) |
171
|
|
|
When the content model flag is set to one of the PCDATA or RCDATA |
172
|
|
|
states and the escape flag is false: switch to the |
173
|
|
|
character reference data state. Otherwise: treat it as per |
174
|
|
|
the "anything else" entry below. */ |
175
|
|
|
$state = 'character reference data'; |
176
|
|
|
|
177
|
|
|
} elseif( |
178
|
|
|
$char === '-' && |
179
|
|
|
$hyp_cond && |
180
|
|
|
$lastFourChars === '<!--' |
181
|
|
|
) { |
182
|
|
|
/* |
183
|
|
|
U+002D HYPHEN-MINUS (-) |
184
|
|
|
If the content model flag is set to either the RCDATA state or |
185
|
|
|
the CDATA state, and the escape flag is false, and there are at |
186
|
|
|
least three characters before this one in the input stream, and the |
187
|
|
|
last four characters in the input stream, including this one, are |
188
|
|
|
U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS, |
189
|
|
|
and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */ |
190
|
|
|
$escape = true; |
191
|
|
|
|
192
|
|
|
/* In any case, emit the input character as a character token. Stay |
193
|
|
|
in the data state. */ |
194
|
|
|
$this->emitToken(array( |
195
|
|
|
'type' => self::CHARACTER, |
196
|
|
|
'data' => '-' |
197
|
|
|
)); |
198
|
|
|
// We do the "any case" part as part of "anything else". |
199
|
|
|
|
200
|
|
|
/* U+003C LESS-THAN SIGN (<) */ |
201
|
|
|
} elseif($char === '<' && $lt_cond) { |
202
|
|
|
/* When the content model flag is set to the PCDATA state: switch |
203
|
|
|
to the tag open state. |
204
|
|
|
|
205
|
|
|
When the content model flag is set to either the RCDATA state or |
206
|
|
|
the CDATA state and the escape flag is false: switch to the tag |
207
|
|
|
open state. |
208
|
|
|
|
209
|
|
|
Otherwise: treat it as per the "anything else" entry below. */ |
210
|
|
|
$state = 'tag open'; |
211
|
|
|
|
212
|
|
|
/* U+003E GREATER-THAN SIGN (>) */ |
213
|
|
|
} elseif( |
214
|
|
|
$char === '>' && |
215
|
|
|
$gt_cond && |
216
|
|
|
substr($lastFourChars, 1) === '-->' |
217
|
|
|
) { |
218
|
|
|
/* If the content model flag is set to either the RCDATA state or |
219
|
|
|
the CDATA state, and the escape flag is true, and the last three |
220
|
|
|
characters in the input stream including this one are U+002D |
221
|
|
|
HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"), |
222
|
|
|
set the escape flag to false. */ |
223
|
|
|
$escape = false; |
224
|
|
|
|
225
|
|
|
/* In any case, emit the input character as a character token. |
226
|
|
|
Stay in the data state. */ |
227
|
|
|
$this->emitToken(array( |
228
|
|
|
'type' => self::CHARACTER, |
229
|
|
|
'data' => '>' |
230
|
|
|
)); |
231
|
|
|
// We do the "any case" part as part of "anything else". |
232
|
|
|
|
233
|
|
|
} elseif($char === false) { |
234
|
|
|
/* EOF |
235
|
|
|
Emit an end-of-file token. */ |
236
|
|
|
$state = null; |
237
|
|
|
$this->tree->emitToken(array( |
238
|
|
|
'type' => self::EOF |
239
|
|
|
)); |
240
|
|
|
|
241
|
|
|
} elseif($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
242
|
|
|
// Directly after emitting a token you switch back to the "data |
243
|
|
|
// state". At that point spaceCharacters are important so they are |
244
|
|
|
// emitted separately. |
245
|
|
|
$chars = $this->stream->charsWhile(self::WHITESPACE); |
246
|
|
|
$this->emitToken(array( |
247
|
|
|
'type' => self::SPACECHARACTER, |
248
|
|
|
'data' => $char . $chars |
249
|
|
|
)); |
250
|
|
|
$lastFourChars .= $chars; |
251
|
|
|
if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4); |
252
|
|
|
|
253
|
|
|
} else { |
254
|
|
|
/* Anything else |
255
|
|
|
THIS IS AN OPTIMIZATION: Get as many character that |
256
|
|
|
otherwise would also be treated as a character token and emit it |
257
|
|
|
as a single character token. Stay in the data state. */ |
258
|
|
|
|
259
|
|
|
$mask = ''; |
260
|
|
|
if ($hyp_cond) $mask .= '-'; |
261
|
|
|
if ($amp_cond) $mask .= '&'; |
262
|
|
|
if ($lt_cond) $mask .= '<'; |
263
|
|
|
if ($gt_cond) $mask .= '>'; |
264
|
|
|
|
265
|
|
|
if ($mask === '') { |
266
|
|
|
$chars = $this->stream->remainingChars(); |
267
|
|
|
} else { |
268
|
|
|
$chars = $this->stream->charsUntil($mask); |
269
|
|
|
} |
270
|
|
|
|
271
|
|
|
$this->emitToken(array( |
272
|
|
|
'type' => self::CHARACTER, |
273
|
|
|
'data' => $char . $chars |
274
|
|
|
)); |
275
|
|
|
|
276
|
|
|
$lastFourChars .= $chars; |
277
|
|
|
if (strlen($lastFourChars) > 4) $lastFourChars = substr($lastFourChars, -4); |
278
|
|
|
|
279
|
|
|
$state = 'data'; |
280
|
|
|
} |
281
|
|
|
break; |
282
|
|
|
|
283
|
|
|
case 'character reference data': |
284
|
|
|
/* (This cannot happen if the content model flag |
285
|
|
|
is set to the CDATA state.) */ |
286
|
|
|
|
287
|
|
|
/* Attempt to consume a character reference, with no |
288
|
|
|
additional allowed character. */ |
289
|
|
|
$entity = $this->consumeCharacterReference(); |
290
|
|
|
|
291
|
|
|
/* If nothing is returned, emit a U+0026 AMPERSAND |
292
|
|
|
character token. Otherwise, emit the character token that |
293
|
|
|
was returned. */ |
294
|
|
|
// This is all done when consuming the character reference. |
295
|
|
|
$this->emitToken(array( |
296
|
|
|
'type' => self::CHARACTER, |
297
|
|
|
'data' => $entity |
298
|
|
|
)); |
299
|
|
|
|
300
|
|
|
/* Finally, switch to the data state. */ |
301
|
|
|
$state = 'data'; |
302
|
|
|
break; |
303
|
|
|
|
304
|
|
|
case 'tag open': |
305
|
|
|
$char = $this->stream->char(); |
306
|
|
|
|
307
|
|
|
switch($this->content_model) { |
308
|
|
|
case self::RCDATA: |
309
|
|
|
case self::CDATA: |
310
|
|
|
/* Consume the next input character. If it is a |
311
|
|
|
U+002F SOLIDUS (/) character, switch to the close |
312
|
|
|
tag open state. Otherwise, emit a U+003C LESS-THAN |
313
|
|
|
SIGN character token and reconsume the current input |
314
|
|
|
character in the data state. */ |
315
|
|
|
// We consumed above. |
316
|
|
|
|
317
|
|
|
if($char === '/') { |
318
|
|
|
$state = 'close tag open'; |
319
|
|
|
|
320
|
|
|
} else { |
321
|
|
|
$this->emitToken(array( |
322
|
|
|
'type' => self::CHARACTER, |
323
|
|
|
'data' => '<' |
324
|
|
|
)); |
325
|
|
|
|
326
|
|
|
$this->stream->unget(); |
327
|
|
|
|
328
|
|
|
$state = 'data'; |
329
|
|
|
} |
330
|
|
|
break; |
331
|
|
|
|
332
|
|
|
case self::PCDATA: |
333
|
|
|
/* If the content model flag is set to the PCDATA state |
334
|
|
|
Consume the next input character: */ |
335
|
|
|
// We consumed above. |
336
|
|
|
|
337
|
|
|
if($char === '!') { |
338
|
|
|
/* U+0021 EXCLAMATION MARK (!) |
339
|
|
|
Switch to the markup declaration open state. */ |
340
|
|
|
$state = 'markup declaration open'; |
341
|
|
|
|
342
|
|
|
} elseif($char === '/') { |
343
|
|
|
/* U+002F SOLIDUS (/) |
344
|
|
|
Switch to the close tag open state. */ |
345
|
|
|
$state = 'close tag open'; |
346
|
|
|
|
347
|
|
|
} elseif('A' <= $char && $char <= 'Z') { |
348
|
|
|
/* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z |
349
|
|
|
Create a new start tag token, set its tag name to the lowercase |
350
|
|
|
version of the input character (add 0x0020 to the character's code |
351
|
|
|
point), then switch to the tag name state. (Don't emit the token |
352
|
|
|
yet; further details will be filled in before it is emitted.) */ |
353
|
|
|
$this->token = array( |
354
|
|
|
'name' => strtolower($char), |
355
|
|
|
'type' => self::STARTTAG, |
356
|
|
|
'attr' => array() |
357
|
|
|
); |
358
|
|
|
|
359
|
|
|
$state = 'tag name'; |
360
|
|
|
|
361
|
|
|
} elseif('a' <= $char && $char <= 'z') { |
362
|
|
|
/* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z |
363
|
|
|
Create a new start tag token, set its tag name to the input |
364
|
|
|
character, then switch to the tag name state. (Don't emit |
365
|
|
|
the token yet; further details will be filled in before it |
366
|
|
|
is emitted.) */ |
367
|
|
|
$this->token = array( |
368
|
|
|
'name' => $char, |
369
|
|
|
'type' => self::STARTTAG, |
370
|
|
|
'attr' => array() |
371
|
|
|
); |
372
|
|
|
|
373
|
|
|
$state = 'tag name'; |
374
|
|
|
|
375
|
|
|
} elseif($char === '>') { |
376
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
377
|
|
|
Parse error. Emit a U+003C LESS-THAN SIGN character token and a |
378
|
|
|
U+003E GREATER-THAN SIGN character token. Switch to the data state. */ |
379
|
|
|
$this->emitToken(array( |
380
|
|
|
'type' => self::PARSEERROR, |
381
|
|
|
'data' => 'expected-tag-name-but-got-right-bracket' |
382
|
|
|
)); |
383
|
|
|
$this->emitToken(array( |
384
|
|
|
'type' => self::CHARACTER, |
385
|
|
|
'data' => '<>' |
386
|
|
|
)); |
387
|
|
|
|
388
|
|
|
$state = 'data'; |
389
|
|
|
|
390
|
|
|
} elseif($char === '?') { |
391
|
|
|
/* U+003F QUESTION MARK (?) |
392
|
|
|
Parse error. Switch to the bogus comment state. */ |
393
|
|
|
$this->emitToken(array( |
394
|
|
|
'type' => self::PARSEERROR, |
395
|
|
|
'data' => 'expected-tag-name-but-got-question-mark' |
396
|
|
|
)); |
397
|
|
|
$this->token = array( |
398
|
|
|
'data' => '?', |
399
|
|
|
'type' => self::COMMENT |
400
|
|
|
); |
401
|
|
|
$state = 'bogus comment'; |
402
|
|
|
|
403
|
|
|
} else { |
404
|
|
|
/* Anything else |
405
|
|
|
Parse error. Emit a U+003C LESS-THAN SIGN character token and |
406
|
|
|
reconsume the current input character in the data state. */ |
407
|
|
|
$this->emitToken(array( |
408
|
|
|
'type' => self::PARSEERROR, |
409
|
|
|
'data' => 'expected-tag-name' |
410
|
|
|
)); |
411
|
|
|
$this->emitToken(array( |
412
|
|
|
'type' => self::CHARACTER, |
413
|
|
|
'data' => '<' |
414
|
|
|
)); |
415
|
|
|
|
416
|
|
|
$state = 'data'; |
417
|
|
|
$this->stream->unget(); |
418
|
|
|
} |
419
|
|
|
break; |
420
|
|
|
} |
421
|
|
|
break; |
422
|
|
|
|
423
|
|
|
case 'close tag open': |
424
|
|
|
if ( |
425
|
|
|
$this->content_model === self::RCDATA || |
426
|
|
|
$this->content_model === self::CDATA |
427
|
|
|
) { |
428
|
|
|
/* If the content model flag is set to the RCDATA or CDATA |
429
|
|
|
states... */ |
430
|
|
|
$name = strtolower($this->stream->charsWhile(self::ALPHA)); |
431
|
|
|
$following = $this->stream->char(); |
432
|
|
|
$this->stream->unget(); |
433
|
|
|
if ( |
434
|
|
|
!$this->token || |
435
|
|
|
$this->token['name'] !== $name || |
436
|
|
|
$this->token['name'] === $name && !in_array($following, array("\x09", "\x0A", "\x0C", "\x20", "\x3E", "\x2F", false)) |
437
|
|
|
) { |
438
|
|
|
/* if no start tag token has ever been emitted by this instance |
439
|
|
|
of the tokenizer (fragment case), or, if the next few |
440
|
|
|
characters do not match the tag name of the last start tag |
441
|
|
|
token emitted (compared in an ASCII case-insensitive manner), |
442
|
|
|
or if they do but they are not immediately followed by one of |
443
|
|
|
the following characters: |
444
|
|
|
|
445
|
|
|
* U+0009 CHARACTER TABULATION |
446
|
|
|
* U+000A LINE FEED (LF) |
447
|
|
|
* U+000C FORM FEED (FF) |
448
|
|
|
* U+0020 SPACE |
449
|
|
|
* U+003E GREATER-THAN SIGN (>) |
450
|
|
|
* U+002F SOLIDUS (/) |
451
|
|
|
* EOF |
452
|
|
|
|
453
|
|
|
...then emit a U+003C LESS-THAN SIGN character token, a |
454
|
|
|
U+002F SOLIDUS character token, and switch to the data |
455
|
|
|
state to process the next input character. */ |
456
|
|
|
// XXX: Probably ought to replace in_array with $following === x ||... |
457
|
|
|
|
458
|
|
|
// We also need to emit $name now we've consumed that, as we |
459
|
|
|
// know it'll just be emitted as a character token. |
460
|
|
|
$this->emitToken(array( |
461
|
|
|
'type' => self::CHARACTER, |
462
|
|
|
'data' => '</' . $name |
463
|
|
|
)); |
464
|
|
|
|
465
|
|
|
$state = 'data'; |
466
|
|
|
} else { |
467
|
|
|
// This matches what would happen if we actually did the |
468
|
|
|
// otherwise below (but we can't because we've consumed too |
469
|
|
|
// much). |
470
|
|
|
|
471
|
|
|
// Start the end tag token with the name we already have. |
472
|
|
|
$this->token = array( |
473
|
|
|
'name' => $name, |
474
|
|
|
'type' => self::ENDTAG |
475
|
|
|
); |
476
|
|
|
|
477
|
|
|
// Change to tag name state. |
478
|
|
|
$state = 'tag name'; |
479
|
|
|
} |
480
|
|
|
} elseif ($this->content_model === self::PCDATA) { |
481
|
|
|
/* Otherwise, if the content model flag is set to the PCDATA |
482
|
|
|
state [...]: */ |
483
|
|
|
$char = $this->stream->char(); |
484
|
|
|
|
485
|
|
|
if ('A' <= $char && $char <= 'Z') { |
486
|
|
|
/* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z |
487
|
|
|
Create a new end tag token, set its tag name to the lowercase version |
488
|
|
|
of the input character (add 0x0020 to the character's code point), then |
489
|
|
|
switch to the tag name state. (Don't emit the token yet; further details |
490
|
|
|
will be filled in before it is emitted.) */ |
491
|
|
|
$this->token = array( |
492
|
|
|
'name' => strtolower($char), |
493
|
|
|
'type' => self::ENDTAG |
494
|
|
|
); |
495
|
|
|
|
496
|
|
|
$state = 'tag name'; |
497
|
|
|
|
498
|
|
|
} elseif ('a' <= $char && $char <= 'z') { |
499
|
|
|
/* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z |
500
|
|
|
Create a new end tag token, set its tag name to the |
501
|
|
|
input character, then switch to the tag name state. |
502
|
|
|
(Don't emit the token yet; further details will be |
503
|
|
|
filled in before it is emitted.) */ |
504
|
|
|
$this->token = array( |
505
|
|
|
'name' => $char, |
506
|
|
|
'type' => self::ENDTAG |
507
|
|
|
); |
508
|
|
|
|
509
|
|
|
$state = 'tag name'; |
510
|
|
|
|
511
|
|
|
} elseif($char === '>') { |
512
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
513
|
|
|
Parse error. Switch to the data state. */ |
514
|
|
|
$this->emitToken(array( |
515
|
|
|
'type' => self::PARSEERROR, |
516
|
|
|
'data' => 'expected-closing-tag-but-got-right-bracket' |
517
|
|
|
)); |
518
|
|
|
$state = 'data'; |
519
|
|
|
|
520
|
|
|
} elseif($char === false) { |
521
|
|
|
/* EOF |
522
|
|
|
Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F |
523
|
|
|
SOLIDUS character token. Reconsume the EOF character in the data state. */ |
524
|
|
|
$this->emitToken(array( |
525
|
|
|
'type' => self::PARSEERROR, |
526
|
|
|
'data' => 'expected-closing-tag-but-got-eof' |
527
|
|
|
)); |
528
|
|
|
$this->emitToken(array( |
529
|
|
|
'type' => self::CHARACTER, |
530
|
|
|
'data' => '</' |
531
|
|
|
)); |
532
|
|
|
|
533
|
|
|
$this->stream->unget(); |
534
|
|
|
$state = 'data'; |
535
|
|
|
|
536
|
|
|
} else { |
537
|
|
|
/* Parse error. Switch to the bogus comment state. */ |
538
|
|
|
$this->emitToken(array( |
539
|
|
|
'type' => self::PARSEERROR, |
540
|
|
|
'data' => 'expected-closing-tag-but-got-char' |
541
|
|
|
)); |
542
|
|
|
$this->token = array( |
543
|
|
|
'data' => $char, |
544
|
|
|
'type' => self::COMMENT |
545
|
|
|
); |
546
|
|
|
$state = 'bogus comment'; |
547
|
|
|
} |
548
|
|
|
} |
549
|
|
|
break; |
550
|
|
|
|
551
|
|
|
case 'tag name': |
552
|
|
|
/* Consume the next input character: */ |
553
|
|
|
$char = $this->stream->char(); |
554
|
|
|
|
555
|
|
|
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
556
|
|
|
/* U+0009 CHARACTER TABULATION |
557
|
|
|
U+000A LINE FEED (LF) |
558
|
|
|
U+000C FORM FEED (FF) |
559
|
|
|
U+0020 SPACE |
560
|
|
|
Switch to the before attribute name state. */ |
561
|
|
|
$state = 'before attribute name'; |
562
|
|
|
|
563
|
|
|
} elseif($char === '/') { |
564
|
|
|
/* U+002F SOLIDUS (/) |
565
|
|
|
Switch to the self-closing start tag state. */ |
566
|
|
|
$state = 'self-closing start tag'; |
567
|
|
|
|
568
|
|
|
} elseif($char === '>') { |
569
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
570
|
|
|
Emit the current tag token. Switch to the data state. */ |
571
|
|
|
$this->emitToken($this->token); |
572
|
|
|
$state = 'data'; |
573
|
|
|
|
574
|
|
|
} elseif('A' <= $char && $char <= 'Z') { |
575
|
|
|
/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z |
576
|
|
|
Append the lowercase version of the current input |
577
|
|
|
character (add 0x0020 to the character's code point) to |
578
|
|
|
the current tag token's tag name. Stay in the tag name state. */ |
579
|
|
|
$chars = $this->stream->charsWhile(self::UPPER_ALPHA); |
580
|
|
|
|
581
|
|
|
$this->token['name'] .= strtolower($char . $chars); |
582
|
|
|
$state = 'tag name'; |
583
|
|
|
|
584
|
|
|
} elseif($char === false) { |
585
|
|
|
/* EOF |
586
|
|
|
Parse error. Reconsume the EOF character in the data state. */ |
587
|
|
|
$this->emitToken(array( |
588
|
|
|
'type' => self::PARSEERROR, |
589
|
|
|
'data' => 'eof-in-tag-name' |
590
|
|
|
)); |
591
|
|
|
|
592
|
|
|
$this->stream->unget(); |
593
|
|
|
$state = 'data'; |
594
|
|
|
|
595
|
|
|
} else { |
596
|
|
|
/* Anything else |
597
|
|
|
Append the current input character to the current tag token's tag name. |
598
|
|
|
Stay in the tag name state. */ |
599
|
|
|
$chars = $this->stream->charsUntil("\t\n\x0C />" . self::UPPER_ALPHA); |
600
|
|
|
|
601
|
|
|
$this->token['name'] .= $char . $chars; |
602
|
|
|
$state = 'tag name'; |
603
|
|
|
} |
604
|
|
|
break; |
605
|
|
|
|
606
|
|
|
case 'before attribute name': |
607
|
|
|
/* Consume the next input character: */ |
608
|
|
|
$char = $this->stream->char(); |
609
|
|
|
|
610
|
|
|
// this conditional is optimized, check bottom |
611
|
|
|
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
612
|
|
|
/* U+0009 CHARACTER TABULATION |
613
|
|
|
U+000A LINE FEED (LF) |
614
|
|
|
U+000C FORM FEED (FF) |
615
|
|
|
U+0020 SPACE |
616
|
|
|
Stay in the before attribute name state. */ |
617
|
|
|
$state = 'before attribute name'; |
618
|
|
|
|
619
|
|
|
} elseif($char === '/') { |
620
|
|
|
/* U+002F SOLIDUS (/) |
621
|
|
|
Switch to the self-closing start tag state. */ |
622
|
|
|
$state = 'self-closing start tag'; |
623
|
|
|
|
624
|
|
|
} elseif($char === '>') { |
625
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
626
|
|
|
Emit the current tag token. Switch to the data state. */ |
627
|
|
|
$this->emitToken($this->token); |
628
|
|
|
$state = 'data'; |
629
|
|
|
|
630
|
|
|
} elseif('A' <= $char && $char <= 'Z') { |
631
|
|
|
/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z |
632
|
|
|
Start a new attribute in the current tag token. Set that |
633
|
|
|
attribute's name to the lowercase version of the current |
634
|
|
|
input character (add 0x0020 to the character's code |
635
|
|
|
point), and its value to the empty string. Switch to the |
636
|
|
|
attribute name state.*/ |
637
|
|
|
$this->token['attr'][] = array( |
638
|
|
|
'name' => strtolower($char), |
639
|
|
|
'value' => '' |
640
|
|
|
); |
641
|
|
|
|
642
|
|
|
$state = 'attribute name'; |
643
|
|
|
|
644
|
|
|
} elseif($char === false) { |
645
|
|
|
/* EOF |
646
|
|
|
Parse error. Reconsume the EOF character in the data state. */ |
647
|
|
|
$this->emitToken(array( |
648
|
|
|
'type' => self::PARSEERROR, |
649
|
|
|
'data' => 'expected-attribute-name-but-got-eof' |
650
|
|
|
)); |
651
|
|
|
|
652
|
|
|
$this->stream->unget(); |
653
|
|
|
$state = 'data'; |
654
|
|
|
|
655
|
|
|
} else { |
656
|
|
|
/* U+0022 QUOTATION MARK (") |
657
|
|
|
U+0027 APOSTROPHE (') |
658
|
|
|
U+003C LESS-THAN SIGN (<) |
659
|
|
|
U+003D EQUALS SIGN (=) |
660
|
|
|
Parse error. Treat it as per the "anything else" entry |
661
|
|
|
below. */ |
662
|
|
|
if($char === '"' || $char === "'" || $char === '<' || $char === '=') { |
663
|
|
|
$this->emitToken(array( |
664
|
|
|
'type' => self::PARSEERROR, |
665
|
|
|
'data' => 'invalid-character-in-attribute-name' |
666
|
|
|
)); |
667
|
|
|
} |
668
|
|
|
|
669
|
|
|
/* Anything else |
670
|
|
|
Start a new attribute in the current tag token. Set that attribute's |
671
|
|
|
name to the current input character, and its value to the empty string. |
672
|
|
|
Switch to the attribute name state. */ |
673
|
|
|
$this->token['attr'][] = array( |
674
|
|
|
'name' => $char, |
675
|
|
|
'value' => '' |
676
|
|
|
); |
677
|
|
|
|
678
|
|
|
$state = 'attribute name'; |
679
|
|
|
} |
680
|
|
|
break; |
681
|
|
|
|
682
|
|
|
case 'attribute name': |
683
|
|
|
// Consume the next input character: |
684
|
|
|
$char = $this->stream->char(); |
685
|
|
|
|
686
|
|
|
// this conditional is optimized, check bottom |
687
|
|
|
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
688
|
|
|
/* U+0009 CHARACTER TABULATION |
689
|
|
|
U+000A LINE FEED (LF) |
690
|
|
|
U+000C FORM FEED (FF) |
691
|
|
|
U+0020 SPACE |
692
|
|
|
Switch to the after attribute name state. */ |
693
|
|
|
$state = 'after attribute name'; |
694
|
|
|
|
695
|
|
|
} elseif($char === '/') { |
696
|
|
|
/* U+002F SOLIDUS (/) |
697
|
|
|
Switch to the self-closing start tag state. */ |
698
|
|
|
$state = 'self-closing start tag'; |
699
|
|
|
|
700
|
|
|
} elseif($char === '=') { |
701
|
|
|
/* U+003D EQUALS SIGN (=) |
702
|
|
|
Switch to the before attribute value state. */ |
703
|
|
|
$state = 'before attribute value'; |
704
|
|
|
|
705
|
|
|
} elseif($char === '>') { |
706
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
707
|
|
|
Emit the current tag token. Switch to the data state. */ |
708
|
|
|
$this->emitToken($this->token); |
709
|
|
|
$state = 'data'; |
710
|
|
|
|
711
|
|
|
} elseif('A' <= $char && $char <= 'Z') { |
712
|
|
|
/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z |
713
|
|
|
Append the lowercase version of the current input |
714
|
|
|
character (add 0x0020 to the character's code point) to |
715
|
|
|
the current attribute's name. Stay in the attribute name |
716
|
|
|
state. */ |
717
|
|
|
$chars = $this->stream->charsWhile(self::UPPER_ALPHA); |
718
|
|
|
|
719
|
|
|
$last = count($this->token['attr']) - 1; |
720
|
|
|
$this->token['attr'][$last]['name'] .= strtolower($char . $chars); |
721
|
|
|
|
722
|
|
|
$state = 'attribute name'; |
723
|
|
|
|
724
|
|
|
} elseif($char === false) { |
725
|
|
|
/* EOF |
726
|
|
|
Parse error. Reconsume the EOF character in the data state. */ |
727
|
|
|
$this->emitToken(array( |
728
|
|
|
'type' => self::PARSEERROR, |
729
|
|
|
'data' => 'eof-in-attribute-name' |
730
|
|
|
)); |
731
|
|
|
|
732
|
|
|
$this->stream->unget(); |
733
|
|
|
$state = 'data'; |
734
|
|
|
|
735
|
|
|
} else { |
736
|
|
|
/* U+0022 QUOTATION MARK (") |
737
|
|
|
U+0027 APOSTROPHE (') |
738
|
|
|
U+003C LESS-THAN SIGN (<) |
739
|
|
|
Parse error. Treat it as per the "anything else" |
740
|
|
|
entry below. */ |
741
|
|
|
if($char === '"' || $char === "'" || $char === '<') { |
742
|
|
|
$this->emitToken(array( |
743
|
|
|
'type' => self::PARSEERROR, |
744
|
|
|
'data' => 'invalid-character-in-attribute-name' |
745
|
|
|
)); |
746
|
|
|
} |
747
|
|
|
|
748
|
|
|
/* Anything else |
749
|
|
|
Append the current input character to the current attribute's name. |
750
|
|
|
Stay in the attribute name state. */ |
751
|
|
|
$chars = $this->stream->charsUntil("\t\n\x0C /=>\"'" . self::UPPER_ALPHA); |
752
|
|
|
|
753
|
|
|
$last = count($this->token['attr']) - 1; |
754
|
|
|
$this->token['attr'][$last]['name'] .= $char . $chars; |
755
|
|
|
|
756
|
|
|
$state = 'attribute name'; |
757
|
|
|
} |
758
|
|
|
|
759
|
|
|
/* When the user agent leaves the attribute name state |
760
|
|
|
(and before emitting the tag token, if appropriate), the |
761
|
|
|
complete attribute's name must be compared to the other |
762
|
|
|
attributes on the same token; if there is already an |
763
|
|
|
attribute on the token with the exact same name, then this |
764
|
|
|
is a parse error and the new attribute must be dropped, along |
765
|
|
|
with the value that gets associated with it (if any). */ |
766
|
|
|
// this might be implemented in the emitToken method |
767
|
|
|
break; |
768
|
|
|
|
769
|
|
|
case 'after attribute name': |
770
|
|
|
// Consume the next input character: |
771
|
|
|
$char = $this->stream->char(); |
772
|
|
|
|
773
|
|
|
// this is an optimized conditional, check the bottom |
774
|
|
|
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
775
|
|
|
/* U+0009 CHARACTER TABULATION |
776
|
|
|
U+000A LINE FEED (LF) |
777
|
|
|
U+000C FORM FEED (FF) |
778
|
|
|
U+0020 SPACE |
779
|
|
|
Stay in the after attribute name state. */ |
780
|
|
|
$state = 'after attribute name'; |
781
|
|
|
|
782
|
|
|
} elseif($char === '/') { |
783
|
|
|
/* U+002F SOLIDUS (/) |
784
|
|
|
Switch to the self-closing start tag state. */ |
785
|
|
|
$state = 'self-closing start tag'; |
786
|
|
|
|
787
|
|
|
} elseif($char === '=') { |
788
|
|
|
/* U+003D EQUALS SIGN (=) |
789
|
|
|
Switch to the before attribute value state. */ |
790
|
|
|
$state = 'before attribute value'; |
791
|
|
|
|
792
|
|
|
} elseif($char === '>') { |
793
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
794
|
|
|
Emit the current tag token. Switch to the data state. */ |
795
|
|
|
$this->emitToken($this->token); |
796
|
|
|
$state = 'data'; |
797
|
|
|
|
798
|
|
|
} elseif('A' <= $char && $char <= 'Z') { |
799
|
|
|
/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z |
800
|
|
|
Start a new attribute in the current tag token. Set that |
801
|
|
|
attribute's name to the lowercase version of the current |
802
|
|
|
input character (add 0x0020 to the character's code |
803
|
|
|
point), and its value to the empty string. Switch to the |
804
|
|
|
attribute name state. */ |
805
|
|
|
$this->token['attr'][] = array( |
806
|
|
|
'name' => strtolower($char), |
807
|
|
|
'value' => '' |
808
|
|
|
); |
809
|
|
|
|
810
|
|
|
$state = 'attribute name'; |
811
|
|
|
|
812
|
|
|
} elseif($char === false) { |
813
|
|
|
/* EOF |
814
|
|
|
Parse error. Reconsume the EOF character in the data state. */ |
815
|
|
|
$this->emitToken(array( |
816
|
|
|
'type' => self::PARSEERROR, |
817
|
|
|
'data' => 'expected-end-of-tag-but-got-eof' |
818
|
|
|
)); |
819
|
|
|
|
820
|
|
|
$this->stream->unget(); |
821
|
|
|
$state = 'data'; |
822
|
|
|
|
823
|
|
|
} else { |
824
|
|
|
/* U+0022 QUOTATION MARK (") |
825
|
|
|
U+0027 APOSTROPHE (') |
826
|
|
|
U+003C LESS-THAN SIGN(<) |
827
|
|
|
Parse error. Treat it as per the "anything else" |
828
|
|
|
entry below. */ |
829
|
|
|
if($char === '"' || $char === "'" || $char === "<") { |
830
|
|
|
$this->emitToken(array( |
831
|
|
|
'type' => self::PARSEERROR, |
832
|
|
|
'data' => 'invalid-character-after-attribute-name' |
833
|
|
|
)); |
834
|
|
|
} |
835
|
|
|
|
836
|
|
|
/* Anything else |
837
|
|
|
Start a new attribute in the current tag token. Set that attribute's |
838
|
|
|
name to the current input character, and its value to the empty string. |
839
|
|
|
Switch to the attribute name state. */ |
840
|
|
|
$this->token['attr'][] = array( |
841
|
|
|
'name' => $char, |
842
|
|
|
'value' => '' |
843
|
|
|
); |
844
|
|
|
|
845
|
|
|
$state = 'attribute name'; |
846
|
|
|
} |
847
|
|
|
break; |
848
|
|
|
|
849
|
|
|
case 'before attribute value': |
850
|
|
|
// Consume the next input character: |
851
|
|
|
$char = $this->stream->char(); |
852
|
|
|
|
853
|
|
|
// this is an optimized conditional |
854
|
|
|
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
855
|
|
|
/* U+0009 CHARACTER TABULATION |
856
|
|
|
U+000A LINE FEED (LF) |
857
|
|
|
U+000C FORM FEED (FF) |
858
|
|
|
U+0020 SPACE |
859
|
|
|
Stay in the before attribute value state. */ |
860
|
|
|
$state = 'before attribute value'; |
861
|
|
|
|
862
|
|
|
} elseif($char === '"') { |
863
|
|
|
/* U+0022 QUOTATION MARK (") |
864
|
|
|
Switch to the attribute value (double-quoted) state. */ |
865
|
|
|
$state = 'attribute value (double-quoted)'; |
866
|
|
|
|
867
|
|
|
} elseif($char === '&') { |
868
|
|
|
/* U+0026 AMPERSAND (&) |
869
|
|
|
Switch to the attribute value (unquoted) state and reconsume |
870
|
|
|
this input character. */ |
871
|
|
|
$this->stream->unget(); |
872
|
|
|
$state = 'attribute value (unquoted)'; |
873
|
|
|
|
874
|
|
|
} elseif($char === '\'') { |
875
|
|
|
/* U+0027 APOSTROPHE (') |
876
|
|
|
Switch to the attribute value (single-quoted) state. */ |
877
|
|
|
$state = 'attribute value (single-quoted)'; |
878
|
|
|
|
879
|
|
|
} elseif($char === '>') { |
880
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
881
|
|
|
Parse error. Emit the current tag token. Switch to the data state. */ |
882
|
|
|
$this->emitToken(array( |
883
|
|
|
'type' => self::PARSEERROR, |
884
|
|
|
'data' => 'expected-attribute-value-but-got-right-bracket' |
885
|
|
|
)); |
886
|
|
|
$this->emitToken($this->token); |
887
|
|
|
$state = 'data'; |
888
|
|
|
|
889
|
|
|
} elseif($char === false) { |
890
|
|
|
/* EOF |
891
|
|
|
Parse error. Reconsume the EOF character in the data state. */ |
892
|
|
|
$this->emitToken(array( |
893
|
|
|
'type' => self::PARSEERROR, |
894
|
|
|
'data' => 'expected-attribute-value-but-got-eof' |
895
|
|
|
)); |
896
|
|
|
$this->stream->unget(); |
897
|
|
|
$state = 'data'; |
898
|
|
|
|
899
|
|
|
} else { |
900
|
|
|
/* U+003D EQUALS SIGN (=) |
901
|
|
|
* U+003C LESS-THAN SIGN (<) |
902
|
|
|
Parse error. Treat it as per the "anything else" entry below. */ |
903
|
|
|
if($char === '=' || $char === '<') { |
904
|
|
|
$this->emitToken(array( |
905
|
|
|
'type' => self::PARSEERROR, |
906
|
|
|
'data' => 'equals-in-unquoted-attribute-value' |
907
|
|
|
)); |
908
|
|
|
} |
909
|
|
|
|
910
|
|
|
/* Anything else |
911
|
|
|
Append the current input character to the current attribute's value. |
912
|
|
|
Switch to the attribute value (unquoted) state. */ |
913
|
|
|
$last = count($this->token['attr']) - 1; |
914
|
|
|
$this->token['attr'][$last]['value'] .= $char; |
915
|
|
|
|
916
|
|
|
$state = 'attribute value (unquoted)'; |
917
|
|
|
} |
918
|
|
|
break; |
919
|
|
|
|
920
|
|
|
case 'attribute value (double-quoted)': |
921
|
|
|
// Consume the next input character: |
922
|
|
|
$char = $this->stream->char(); |
923
|
|
|
|
924
|
|
|
if($char === '"') { |
925
|
|
|
/* U+0022 QUOTATION MARK (") |
926
|
|
|
Switch to the after attribute value (quoted) state. */ |
927
|
|
|
$state = 'after attribute value (quoted)'; |
928
|
|
|
|
929
|
|
|
} elseif($char === '&') { |
930
|
|
|
/* U+0026 AMPERSAND (&) |
931
|
|
|
Switch to the character reference in attribute value |
932
|
|
|
state, with the additional allowed character |
933
|
|
|
being U+0022 QUOTATION MARK ("). */ |
934
|
|
|
$this->characterReferenceInAttributeValue('"'); |
935
|
|
|
|
936
|
|
|
} elseif($char === false) { |
937
|
|
|
/* EOF |
938
|
|
|
Parse error. Reconsume the EOF character in the data state. */ |
939
|
|
|
$this->emitToken(array( |
940
|
|
|
'type' => self::PARSEERROR, |
941
|
|
|
'data' => 'eof-in-attribute-value-double-quote' |
942
|
|
|
)); |
943
|
|
|
|
944
|
|
|
$this->stream->unget(); |
945
|
|
|
$state = 'data'; |
946
|
|
|
|
947
|
|
|
} else { |
948
|
|
|
/* Anything else |
949
|
|
|
Append the current input character to the current attribute's value. |
950
|
|
|
Stay in the attribute value (double-quoted) state. */ |
951
|
|
|
$chars = $this->stream->charsUntil('"&'); |
952
|
|
|
|
953
|
|
|
$last = count($this->token['attr']) - 1; |
954
|
|
|
$this->token['attr'][$last]['value'] .= $char . $chars; |
955
|
|
|
|
956
|
|
|
$state = 'attribute value (double-quoted)'; |
957
|
|
|
} |
958
|
|
|
break; |
959
|
|
|
|
960
|
|
|
case 'attribute value (single-quoted)': |
961
|
|
|
// Consume the next input character: |
962
|
|
|
$char = $this->stream->char(); |
963
|
|
|
|
964
|
|
|
if($char === "'") { |
965
|
|
|
/* U+0027 APOSTROPHE (') |
966
|
|
|
Switch to the after attribute value (quoted) state. */ |
967
|
|
|
$state = 'after attribute value (quoted)'; |
968
|
|
|
|
969
|
|
|
} elseif($char === '&') { |
970
|
|
|
/* U+0026 AMPERSAND (&) |
971
|
|
|
Switch to the entity in attribute value state. */ |
972
|
|
|
$this->characterReferenceInAttributeValue("'"); |
973
|
|
|
|
974
|
|
|
} elseif($char === false) { |
975
|
|
|
/* EOF |
976
|
|
|
Parse error. Reconsume the EOF character in the data state. */ |
977
|
|
|
$this->emitToken(array( |
978
|
|
|
'type' => self::PARSEERROR, |
979
|
|
|
'data' => 'eof-in-attribute-value-single-quote' |
980
|
|
|
)); |
981
|
|
|
|
982
|
|
|
$this->stream->unget(); |
983
|
|
|
$state = 'data'; |
984
|
|
|
|
985
|
|
|
} else { |
986
|
|
|
/* Anything else |
987
|
|
|
Append the current input character to the current attribute's value. |
988
|
|
|
Stay in the attribute value (single-quoted) state. */ |
989
|
|
|
$chars = $this->stream->charsUntil("'&"); |
990
|
|
|
|
991
|
|
|
$last = count($this->token['attr']) - 1; |
992
|
|
|
$this->token['attr'][$last]['value'] .= $char . $chars; |
993
|
|
|
|
994
|
|
|
$state = 'attribute value (single-quoted)'; |
995
|
|
|
} |
996
|
|
|
break; |
997
|
|
|
|
998
|
|
|
case 'attribute value (unquoted)': |
999
|
|
|
// Consume the next input character: |
1000
|
|
|
$char = $this->stream->char(); |
1001
|
|
|
|
1002
|
|
|
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
1003
|
|
|
/* U+0009 CHARACTER TABULATION |
1004
|
|
|
U+000A LINE FEED (LF) |
1005
|
|
|
U+000C FORM FEED (FF) |
1006
|
|
|
U+0020 SPACE |
1007
|
|
|
Switch to the before attribute name state. */ |
1008
|
|
|
$state = 'before attribute name'; |
1009
|
|
|
|
1010
|
|
|
} elseif($char === '&') { |
1011
|
|
|
/* U+0026 AMPERSAND (&) |
1012
|
|
|
Switch to the entity in attribute value state, with the |
1013
|
|
|
additional allowed character being U+003E |
1014
|
|
|
GREATER-THAN SIGN (>). */ |
1015
|
|
|
$this->characterReferenceInAttributeValue('>'); |
1016
|
|
|
|
1017
|
|
|
} elseif($char === '>') { |
1018
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
1019
|
|
|
Emit the current tag token. Switch to the data state. */ |
1020
|
|
|
$this->emitToken($this->token); |
1021
|
|
|
$state = 'data'; |
1022
|
|
|
|
1023
|
|
|
} elseif ($char === false) { |
1024
|
|
|
/* EOF |
1025
|
|
|
Parse error. Reconsume the EOF character in the data state. */ |
1026
|
|
|
$this->emitToken(array( |
1027
|
|
|
'type' => self::PARSEERROR, |
1028
|
|
|
'data' => 'eof-in-attribute-value-no-quotes' |
1029
|
|
|
)); |
1030
|
|
|
$this->stream->unget(); |
1031
|
|
|
$state = 'data'; |
1032
|
|
|
|
1033
|
|
|
} else { |
1034
|
|
|
/* U+0022 QUOTATION MARK (") |
1035
|
|
|
U+0027 APOSTROPHE (') |
1036
|
|
|
U+003C LESS-THAN SIGN (<) |
1037
|
|
|
U+003D EQUALS SIGN (=) |
1038
|
|
|
Parse error. Treat it as per the "anything else" |
1039
|
|
|
entry below. */ |
1040
|
|
|
if($char === '"' || $char === "'" || $char === '=' || $char == '<') { |
1041
|
|
|
$this->emitToken(array( |
1042
|
|
|
'type' => self::PARSEERROR, |
1043
|
|
|
'data' => 'unexpected-character-in-unquoted-attribute-value' |
1044
|
|
|
)); |
1045
|
|
|
} |
1046
|
|
|
|
1047
|
|
|
/* Anything else |
1048
|
|
|
Append the current input character to the current attribute's value. |
1049
|
|
|
Stay in the attribute value (unquoted) state. */ |
1050
|
|
|
$chars = $this->stream->charsUntil("\t\n\x0c &>\"'="); |
1051
|
|
|
|
1052
|
|
|
$last = count($this->token['attr']) - 1; |
1053
|
|
|
$this->token['attr'][$last]['value'] .= $char . $chars; |
1054
|
|
|
|
1055
|
|
|
$state = 'attribute value (unquoted)'; |
1056
|
|
|
} |
1057
|
|
|
break; |
1058
|
|
|
|
1059
|
|
|
case 'after attribute value (quoted)': |
1060
|
|
|
/* Consume the next input character: */ |
1061
|
|
|
$char = $this->stream->char(); |
1062
|
|
|
|
1063
|
|
|
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
1064
|
|
|
/* U+0009 CHARACTER TABULATION |
1065
|
|
|
U+000A LINE FEED (LF) |
1066
|
|
|
U+000C FORM FEED (FF) |
1067
|
|
|
U+0020 SPACE |
1068
|
|
|
Switch to the before attribute name state. */ |
1069
|
|
|
$state = 'before attribute name'; |
1070
|
|
|
|
1071
|
|
|
} elseif ($char === '/') { |
1072
|
|
|
/* U+002F SOLIDUS (/) |
1073
|
|
|
Switch to the self-closing start tag state. */ |
1074
|
|
|
$state = 'self-closing start tag'; |
1075
|
|
|
|
1076
|
|
|
} elseif ($char === '>') { |
1077
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
1078
|
|
|
Emit the current tag token. Switch to the data state. */ |
1079
|
|
|
$this->emitToken($this->token); |
1080
|
|
|
$state = 'data'; |
1081
|
|
|
|
1082
|
|
|
} elseif ($char === false) { |
1083
|
|
|
/* EOF |
1084
|
|
|
Parse error. Reconsume the EOF character in the data state. */ |
1085
|
|
|
$this->emitToken(array( |
1086
|
|
|
'type' => self::PARSEERROR, |
1087
|
|
|
'data' => 'unexpected-EOF-after-attribute-value' |
1088
|
|
|
)); |
1089
|
|
|
$this->stream->unget(); |
1090
|
|
|
$state = 'data'; |
1091
|
|
|
|
1092
|
|
|
} else { |
1093
|
|
|
/* Anything else |
1094
|
|
|
Parse error. Reconsume the character in the before attribute |
1095
|
|
|
name state. */ |
1096
|
|
|
$this->emitToken(array( |
1097
|
|
|
'type' => self::PARSEERROR, |
1098
|
|
|
'data' => 'unexpected-character-after-attribute-value' |
1099
|
|
|
)); |
1100
|
|
|
$this->stream->unget(); |
1101
|
|
|
$state = 'before attribute name'; |
1102
|
|
|
} |
1103
|
|
|
break; |
1104
|
|
|
|
1105
|
|
|
case 'self-closing start tag': |
1106
|
|
|
/* Consume the next input character: */ |
1107
|
|
|
$char = $this->stream->char(); |
1108
|
|
|
|
1109
|
|
|
if ($char === '>') { |
1110
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
1111
|
|
|
Set the self-closing flag of the current tag token. |
1112
|
|
|
Emit the current tag token. Switch to the data state. */ |
1113
|
|
|
// not sure if this is the name we want |
1114
|
|
|
$this->token['self-closing'] = true; |
1115
|
|
|
$this->emitToken($this->token); |
1116
|
|
|
$state = 'data'; |
1117
|
|
|
|
1118
|
|
|
} elseif ($char === false) { |
1119
|
|
|
/* EOF |
1120
|
|
|
Parse error. Reconsume the EOF character in the data state. */ |
1121
|
|
|
$this->emitToken(array( |
1122
|
|
|
'type' => self::PARSEERROR, |
1123
|
|
|
'data' => 'unexpected-eof-after-self-closing' |
1124
|
|
|
)); |
1125
|
|
|
$this->stream->unget(); |
1126
|
|
|
$state = 'data'; |
1127
|
|
|
|
1128
|
|
|
} else { |
1129
|
|
|
/* Anything else |
1130
|
|
|
Parse error. Reconsume the character in the before attribute name state. */ |
1131
|
|
|
$this->emitToken(array( |
1132
|
|
|
'type' => self::PARSEERROR, |
1133
|
|
|
'data' => 'unexpected-character-after-self-closing' |
1134
|
|
|
)); |
1135
|
|
|
$this->stream->unget(); |
1136
|
|
|
$state = 'before attribute name'; |
1137
|
|
|
} |
1138
|
|
|
break; |
1139
|
|
|
|
1140
|
|
|
case 'bogus comment': |
1141
|
|
|
/* (This can only happen if the content model flag is set to the PCDATA state.) */ |
1142
|
|
|
/* Consume every character up to the first U+003E GREATER-THAN SIGN |
1143
|
|
|
character (>) or the end of the file (EOF), whichever comes first. Emit |
1144
|
|
|
a comment token whose data is the concatenation of all the characters |
1145
|
|
|
starting from and including the character that caused the state machine |
1146
|
|
|
to switch into the bogus comment state, up to and including the last |
1147
|
|
|
consumed character before the U+003E character, if any, or up to the |
1148
|
|
|
end of the file otherwise. (If the comment was started by the end of |
1149
|
|
|
the file (EOF), the token is empty.) */ |
1150
|
|
|
$this->token['data'] .= (string) $this->stream->charsUntil('>'); |
1151
|
|
|
$this->stream->char(); |
1152
|
|
|
|
1153
|
|
|
$this->emitToken($this->token); |
1154
|
|
|
|
1155
|
|
|
/* Switch to the data state. */ |
1156
|
|
|
$state = 'data'; |
1157
|
|
|
break; |
1158
|
|
|
|
1159
|
|
|
case 'markup declaration open': |
1160
|
|
|
// Consume for below |
1161
|
|
|
$hyphens = $this->stream->charsWhile('-', 2); |
1162
|
|
|
if ($hyphens === '-') { |
1163
|
|
|
$this->stream->unget(); |
1164
|
|
|
} |
1165
|
|
|
if ($hyphens !== '--') { |
1166
|
|
|
$alpha = $this->stream->charsWhile(self::ALPHA, 7); |
1167
|
|
|
} |
1168
|
|
|
|
1169
|
|
|
/* If the next two characters are both U+002D HYPHEN-MINUS (-) |
1170
|
|
|
characters, consume those two characters, create a comment token whose |
1171
|
|
|
data is the empty string, and switch to the comment start state. */ |
1172
|
|
|
if($hyphens === '--') { |
1173
|
|
|
$state = 'comment start'; |
1174
|
|
|
$this->token = array( |
1175
|
|
|
'data' => '', |
1176
|
|
|
'type' => self::COMMENT |
1177
|
|
|
); |
1178
|
|
|
|
1179
|
|
|
/* Otherwise if the next seven characters are a case-insensitive match |
1180
|
|
|
for the word "DOCTYPE", then consume those characters and switch to the |
1181
|
|
|
DOCTYPE state. */ |
1182
|
|
|
} elseif(strtoupper($alpha) === 'DOCTYPE') { |
1183
|
|
|
$state = 'DOCTYPE'; |
1184
|
|
|
|
1185
|
|
|
// XXX not implemented |
1186
|
|
|
/* Otherwise, if the insertion mode is "in foreign content" |
1187
|
|
|
and the current node is not an element in the HTML namespace |
1188
|
|
|
and the next seven characters are an ASCII case-sensitive |
1189
|
|
|
match for the string "[CDATA[" (the five uppercase letters |
1190
|
|
|
"CDATA" with a U+005B LEFT SQUARE BRACKET character before |
1191
|
|
|
and after), then consume those characters and switch to the |
1192
|
|
|
CDATA section state (which is unrelated to the content model |
1193
|
|
|
flag's CDATA state). */ |
1194
|
|
|
|
1195
|
|
|
/* Otherwise, it is a parse error. Switch to the bogus comment state. |
1196
|
|
|
The next character that is consumed, if any, is the first character |
1197
|
|
|
that will be in the comment. */ |
1198
|
|
|
} else { |
1199
|
|
|
$this->emitToken(array( |
1200
|
|
|
'type' => self::PARSEERROR, |
1201
|
|
|
'data' => 'expected-dashes-or-doctype' |
1202
|
|
|
)); |
1203
|
|
|
$this->token = array( |
1204
|
|
|
'data' => (string) $alpha, |
1205
|
|
|
'type' => self::COMMENT |
1206
|
|
|
); |
1207
|
|
|
$state = 'bogus comment'; |
1208
|
|
|
} |
1209
|
|
|
break; |
1210
|
|
|
|
1211
|
|
|
case 'comment start': |
1212
|
|
|
/* Consume the next input character: */ |
1213
|
|
|
$char = $this->stream->char(); |
1214
|
|
|
|
1215
|
|
|
if ($char === '-') { |
1216
|
|
|
/* U+002D HYPHEN-MINUS (-) |
1217
|
|
|
Switch to the comment start dash state. */ |
1218
|
|
|
$state = 'comment start dash'; |
1219
|
|
|
} elseif ($char === '>') { |
1220
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
1221
|
|
|
Parse error. Emit the comment token. Switch to the |
1222
|
|
|
data state. */ |
1223
|
|
|
$this->emitToken(array( |
1224
|
|
|
'type' => self::PARSEERROR, |
1225
|
|
|
'data' => 'incorrect-comment' |
1226
|
|
|
)); |
1227
|
|
|
$this->emitToken($this->token); |
1228
|
|
|
$state = 'data'; |
1229
|
|
|
} elseif ($char === false) { |
1230
|
|
|
/* EOF |
1231
|
|
|
Parse error. Emit the comment token. Reconsume the |
1232
|
|
|
EOF character in the data state. */ |
1233
|
|
|
$this->emitToken(array( |
1234
|
|
|
'type' => self::PARSEERROR, |
1235
|
|
|
'data' => 'eof-in-comment' |
1236
|
|
|
)); |
1237
|
|
|
$this->emitToken($this->token); |
1238
|
|
|
$this->stream->unget(); |
1239
|
|
|
$state = 'data'; |
1240
|
|
|
} else { |
1241
|
|
|
/* Anything else |
1242
|
|
|
Append the input character to the comment token's |
1243
|
|
|
data. Switch to the comment state. */ |
1244
|
|
|
$this->token['data'] .= $char; |
1245
|
|
|
$state = 'comment'; |
1246
|
|
|
} |
1247
|
|
|
break; |
1248
|
|
|
|
1249
|
|
|
case 'comment start dash': |
1250
|
|
|
/* Consume the next input character: */ |
1251
|
|
|
$char = $this->stream->char(); |
1252
|
|
|
if ($char === '-') { |
1253
|
|
|
/* U+002D HYPHEN-MINUS (-) |
1254
|
|
|
Switch to the comment end state */ |
1255
|
|
|
$state = 'comment end'; |
1256
|
|
|
} elseif ($char === '>') { |
1257
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
1258
|
|
|
Parse error. Emit the comment token. Switch to the |
1259
|
|
|
data state. */ |
1260
|
|
|
$this->emitToken(array( |
1261
|
|
|
'type' => self::PARSEERROR, |
1262
|
|
|
'data' => 'incorrect-comment' |
1263
|
|
|
)); |
1264
|
|
|
$this->emitToken($this->token); |
1265
|
|
|
$state = 'data'; |
1266
|
|
|
} elseif ($char === false) { |
1267
|
|
|
/* Parse error. Emit the comment token. Reconsume the |
1268
|
|
|
EOF character in the data state. */ |
1269
|
|
|
$this->emitToken(array( |
1270
|
|
|
'type' => self::PARSEERROR, |
1271
|
|
|
'data' => 'eof-in-comment' |
1272
|
|
|
)); |
1273
|
|
|
$this->emitToken($this->token); |
1274
|
|
|
$this->stream->unget(); |
1275
|
|
|
$state = 'data'; |
1276
|
|
|
} else { |
1277
|
|
|
$this->token['data'] .= '-' . $char; |
1278
|
|
|
$state = 'comment'; |
1279
|
|
|
} |
1280
|
|
|
break; |
1281
|
|
|
|
1282
|
|
|
case 'comment': |
1283
|
|
|
/* Consume the next input character: */ |
1284
|
|
|
$char = $this->stream->char(); |
1285
|
|
|
|
1286
|
|
|
if($char === '-') { |
1287
|
|
|
/* U+002D HYPHEN-MINUS (-) |
1288
|
|
|
Switch to the comment end dash state */ |
1289
|
|
|
$state = 'comment end dash'; |
1290
|
|
|
|
1291
|
|
|
} elseif($char === false) { |
1292
|
|
|
/* EOF |
1293
|
|
|
Parse error. Emit the comment token. Reconsume the EOF character |
1294
|
|
|
in the data state. */ |
1295
|
|
|
$this->emitToken(array( |
1296
|
|
|
'type' => self::PARSEERROR, |
1297
|
|
|
'data' => 'eof-in-comment' |
1298
|
|
|
)); |
1299
|
|
|
$this->emitToken($this->token); |
1300
|
|
|
$this->stream->unget(); |
1301
|
|
|
$state = 'data'; |
1302
|
|
|
|
1303
|
|
|
} else { |
1304
|
|
|
/* Anything else |
1305
|
|
|
Append the input character to the comment token's data. Stay in |
1306
|
|
|
the comment state. */ |
1307
|
|
|
$chars = $this->stream->charsUntil('-'); |
1308
|
|
|
|
1309
|
|
|
$this->token['data'] .= $char . $chars; |
1310
|
|
|
} |
1311
|
|
|
break; |
1312
|
|
|
|
1313
|
|
|
case 'comment end dash': |
1314
|
|
|
/* Consume the next input character: */ |
1315
|
|
|
$char = $this->stream->char(); |
1316
|
|
|
|
1317
|
|
|
if($char === '-') { |
1318
|
|
|
/* U+002D HYPHEN-MINUS (-) |
1319
|
|
|
Switch to the comment end state */ |
1320
|
|
|
$state = 'comment end'; |
1321
|
|
|
|
1322
|
|
|
} elseif($char === false) { |
1323
|
|
|
/* EOF |
1324
|
|
|
Parse error. Emit the comment token. Reconsume the EOF character |
1325
|
|
|
in the data state. */ |
1326
|
|
|
$this->emitToken(array( |
1327
|
|
|
'type' => self::PARSEERROR, |
1328
|
|
|
'data' => 'eof-in-comment-end-dash' |
1329
|
|
|
)); |
1330
|
|
|
$this->emitToken($this->token); |
1331
|
|
|
$this->stream->unget(); |
1332
|
|
|
$state = 'data'; |
1333
|
|
|
|
1334
|
|
|
} else { |
1335
|
|
|
/* Anything else |
1336
|
|
|
Append a U+002D HYPHEN-MINUS (-) character and the input |
1337
|
|
|
character to the comment token's data. Switch to the comment state. */ |
1338
|
|
|
$this->token['data'] .= '-'.$char; |
1339
|
|
|
$state = 'comment'; |
1340
|
|
|
} |
1341
|
|
|
break; |
1342
|
|
|
|
1343
|
|
|
case 'comment end': |
1344
|
|
|
/* Consume the next input character: */ |
1345
|
|
|
$char = $this->stream->char(); |
1346
|
|
|
|
1347
|
|
|
if($char === '>') { |
1348
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
1349
|
|
|
Emit the comment token. Switch to the data state. */ |
1350
|
|
|
$this->emitToken($this->token); |
1351
|
|
|
$state = 'data'; |
1352
|
|
|
|
1353
|
|
|
} elseif($char === '-') { |
1354
|
|
|
/* U+002D HYPHEN-MINUS (-) |
1355
|
|
|
Parse error. Append a U+002D HYPHEN-MINUS (-) character |
1356
|
|
|
to the comment token's data. Stay in the comment end |
1357
|
|
|
state. */ |
1358
|
|
|
$this->emitToken(array( |
1359
|
|
|
'type' => self::PARSEERROR, |
1360
|
|
|
'data' => 'unexpected-dash-after-double-dash-in-comment' |
1361
|
|
|
)); |
1362
|
|
|
$this->token['data'] .= '-'; |
1363
|
|
|
|
1364
|
|
|
} elseif($char === "\t" || $char === "\n" || $char === "\x0a" || $char === ' ') { |
1365
|
|
|
$this->emitToken(array( |
1366
|
|
|
'type' => self::PARSEERROR, |
1367
|
|
|
'data' => 'unexpected-space-after-double-dash-in-comment' |
1368
|
|
|
)); |
1369
|
|
|
$this->token['data'] .= '--' . $char; |
1370
|
|
|
$state = 'comment end space'; |
1371
|
|
|
|
1372
|
|
|
} elseif($char === '!') { |
1373
|
|
|
$this->emitToken(array( |
1374
|
|
|
'type' => self::PARSEERROR, |
1375
|
|
|
'data' => 'unexpected-bang-after-double-dash-in-comment' |
1376
|
|
|
)); |
1377
|
|
|
$state = 'comment end bang'; |
1378
|
|
|
|
1379
|
|
|
} elseif($char === false) { |
1380
|
|
|
/* EOF |
1381
|
|
|
Parse error. Emit the comment token. Reconsume the |
1382
|
|
|
EOF character in the data state. */ |
1383
|
|
|
$this->emitToken(array( |
1384
|
|
|
'type' => self::PARSEERROR, |
1385
|
|
|
'data' => 'eof-in-comment-double-dash' |
1386
|
|
|
)); |
1387
|
|
|
$this->emitToken($this->token); |
1388
|
|
|
$this->stream->unget(); |
1389
|
|
|
$state = 'data'; |
1390
|
|
|
|
1391
|
|
|
} else { |
1392
|
|
|
/* Anything else |
1393
|
|
|
Parse error. Append two U+002D HYPHEN-MINUS (-) |
1394
|
|
|
characters and the input character to the comment token's |
1395
|
|
|
data. Switch to the comment state. */ |
1396
|
|
|
$this->emitToken(array( |
1397
|
|
|
'type' => self::PARSEERROR, |
1398
|
|
|
'data' => 'unexpected-char-in-comment' |
1399
|
|
|
)); |
1400
|
|
|
$this->token['data'] .= '--'.$char; |
1401
|
|
|
$state = 'comment'; |
1402
|
|
|
} |
1403
|
|
|
break; |
1404
|
|
|
|
1405
|
|
|
case 'comment end bang': |
1406
|
|
|
$char = $this->stream->char(); |
1407
|
|
|
if ($char === '>') { |
1408
|
|
|
$this->emitToken($this->token); |
1409
|
|
|
$state = 'data'; |
1410
|
|
|
} elseif ($char === "-") { |
1411
|
|
|
$this->token['data'] .= '--!'; |
1412
|
|
|
$state = 'comment end dash'; |
1413
|
|
|
} elseif ($char === false) { |
1414
|
|
|
$this->emitToken(array( |
1415
|
|
|
'type' => self::PARSEERROR, |
1416
|
|
|
'data' => 'eof-in-comment-end-bang' |
1417
|
|
|
)); |
1418
|
|
|
$this->emitToken($this->token); |
1419
|
|
|
$this->stream->unget(); |
1420
|
|
|
$state = 'data'; |
1421
|
|
|
} else { |
1422
|
|
|
$this->token['data'] .= '--!' . $char; |
1423
|
|
|
$state = 'comment'; |
1424
|
|
|
} |
1425
|
|
|
break; |
1426
|
|
|
|
1427
|
|
|
case 'comment end space': |
1428
|
|
|
$char = $this->stream->char(); |
1429
|
|
|
if ($char === '>') { |
1430
|
|
|
$this->emitToken($this->token); |
1431
|
|
|
$state = 'data'; |
1432
|
|
|
} elseif ($char === '-') { |
1433
|
|
|
$state = 'comment end dash'; |
1434
|
|
|
} elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
1435
|
|
|
$this->token['data'] .= $char; |
1436
|
|
|
} elseif ($char === false) { |
1437
|
|
|
$this->emitToken(array( |
1438
|
|
|
'type' => self::PARSEERROR, |
1439
|
|
|
'data' => 'unexpected-eof-in-comment-end-space', |
1440
|
|
|
)); |
1441
|
|
|
$this->emitToken($this->token); |
1442
|
|
|
$this->stream->unget(); |
1443
|
|
|
$state = 'data'; |
1444
|
|
|
} else { |
1445
|
|
|
$this->token['data'] .= $char; |
1446
|
|
|
$state = 'comment'; |
1447
|
|
|
} |
1448
|
|
|
break; |
1449
|
|
|
|
1450
|
|
|
case 'DOCTYPE': |
1451
|
|
|
/* Consume the next input character: */ |
1452
|
|
|
$char = $this->stream->char(); |
1453
|
|
|
|
1454
|
|
|
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
1455
|
|
|
/* U+0009 CHARACTER TABULATION |
1456
|
|
|
U+000A LINE FEED (LF) |
1457
|
|
|
U+000C FORM FEED (FF) |
1458
|
|
|
U+0020 SPACE |
1459
|
|
|
Switch to the before DOCTYPE name state. */ |
1460
|
|
|
$state = 'before DOCTYPE name'; |
1461
|
|
|
|
1462
|
|
|
} elseif($char === false) { |
1463
|
|
|
/* EOF |
1464
|
|
|
Parse error. Create a new DOCTYPE token. Set its |
1465
|
|
|
force-quirks flag to on. Emit the token. Reconsume the |
1466
|
|
|
EOF character in the data state. */ |
1467
|
|
|
$this->emitToken(array( |
1468
|
|
|
'type' => self::PARSEERROR, |
1469
|
|
|
'data' => 'need-space-after-doctype-but-got-eof' |
1470
|
|
|
)); |
1471
|
|
|
$this->emitToken(array( |
1472
|
|
|
'name' => '', |
1473
|
|
|
'type' => self::DOCTYPE, |
1474
|
|
|
'force-quirks' => true, |
1475
|
|
|
'error' => true |
1476
|
|
|
)); |
1477
|
|
|
$this->stream->unget(); |
1478
|
|
|
$state = 'data'; |
1479
|
|
|
|
1480
|
|
|
} else { |
1481
|
|
|
/* Anything else |
1482
|
|
|
Parse error. Reconsume the current character in the |
1483
|
|
|
before DOCTYPE name state. */ |
1484
|
|
|
$this->emitToken(array( |
1485
|
|
|
'type' => self::PARSEERROR, |
1486
|
|
|
'data' => 'need-space-after-doctype' |
1487
|
|
|
)); |
1488
|
|
|
$this->stream->unget(); |
1489
|
|
|
$state = 'before DOCTYPE name'; |
1490
|
|
|
} |
1491
|
|
|
break; |
1492
|
|
|
|
1493
|
|
|
case 'before DOCTYPE name': |
1494
|
|
|
/* Consume the next input character: */ |
1495
|
|
|
$char = $this->stream->char(); |
1496
|
|
|
|
1497
|
|
|
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
1498
|
|
|
/* U+0009 CHARACTER TABULATION |
1499
|
|
|
U+000A LINE FEED (LF) |
1500
|
|
|
U+000C FORM FEED (FF) |
1501
|
|
|
U+0020 SPACE |
1502
|
|
|
Stay in the before DOCTYPE name state. */ |
1503
|
|
|
|
1504
|
|
|
} elseif($char === '>') { |
1505
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
1506
|
|
|
Parse error. Create a new DOCTYPE token. Set its |
1507
|
|
|
force-quirks flag to on. Emit the token. Switch to the |
1508
|
|
|
data state. */ |
1509
|
|
|
$this->emitToken(array( |
1510
|
|
|
'type' => self::PARSEERROR, |
1511
|
|
|
'data' => 'expected-doctype-name-but-got-right-bracket' |
1512
|
|
|
)); |
1513
|
|
|
$this->emitToken(array( |
1514
|
|
|
'name' => '', |
1515
|
|
|
'type' => self::DOCTYPE, |
1516
|
|
|
'force-quirks' => true, |
1517
|
|
|
'error' => true |
1518
|
|
|
)); |
1519
|
|
|
|
1520
|
|
|
$state = 'data'; |
1521
|
|
|
|
1522
|
|
|
} elseif('A' <= $char && $char <= 'Z') { |
1523
|
|
|
/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z |
1524
|
|
|
Create a new DOCTYPE token. Set the token's name to the |
1525
|
|
|
lowercase version of the input character (add 0x0020 to |
1526
|
|
|
the character's code point). Switch to the DOCTYPE name |
1527
|
|
|
state. */ |
1528
|
|
|
$this->token = array( |
1529
|
|
|
'name' => strtolower($char), |
1530
|
|
|
'type' => self::DOCTYPE, |
1531
|
|
|
'error' => true |
1532
|
|
|
); |
1533
|
|
|
|
1534
|
|
|
$state = 'DOCTYPE name'; |
1535
|
|
|
|
1536
|
|
|
} elseif($char === false) { |
1537
|
|
|
/* EOF |
1538
|
|
|
Parse error. Create a new DOCTYPE token. Set its |
1539
|
|
|
force-quirks flag to on. Emit the token. Reconsume the |
1540
|
|
|
EOF character in the data state. */ |
1541
|
|
|
$this->emitToken(array( |
1542
|
|
|
'type' => self::PARSEERROR, |
1543
|
|
|
'data' => 'expected-doctype-name-but-got-eof' |
1544
|
|
|
)); |
1545
|
|
|
$this->emitToken(array( |
1546
|
|
|
'name' => '', |
1547
|
|
|
'type' => self::DOCTYPE, |
1548
|
|
|
'force-quirks' => true, |
1549
|
|
|
'error' => true |
1550
|
|
|
)); |
1551
|
|
|
|
1552
|
|
|
$this->stream->unget(); |
1553
|
|
|
$state = 'data'; |
1554
|
|
|
|
1555
|
|
|
} else { |
1556
|
|
|
/* Anything else |
1557
|
|
|
Create a new DOCTYPE token. Set the token's name to the |
1558
|
|
|
current input character. Switch to the DOCTYPE name state. */ |
1559
|
|
|
$this->token = array( |
1560
|
|
|
'name' => $char, |
1561
|
|
|
'type' => self::DOCTYPE, |
1562
|
|
|
'error' => true |
1563
|
|
|
); |
1564
|
|
|
|
1565
|
|
|
$state = 'DOCTYPE name'; |
1566
|
|
|
} |
1567
|
|
|
break; |
1568
|
|
|
|
1569
|
|
|
case 'DOCTYPE name': |
1570
|
|
|
/* Consume the next input character: */ |
1571
|
|
|
$char = $this->stream->char(); |
1572
|
|
|
|
1573
|
|
|
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
1574
|
|
|
/* U+0009 CHARACTER TABULATION |
1575
|
|
|
U+000A LINE FEED (LF) |
1576
|
|
|
U+000C FORM FEED (FF) |
1577
|
|
|
U+0020 SPACE |
1578
|
|
|
Switch to the after DOCTYPE name state. */ |
1579
|
|
|
$state = 'after DOCTYPE name'; |
1580
|
|
|
|
1581
|
|
|
} elseif($char === '>') { |
1582
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
1583
|
|
|
Emit the current DOCTYPE token. Switch to the data state. */ |
1584
|
|
|
$this->emitToken($this->token); |
1585
|
|
|
$state = 'data'; |
1586
|
|
|
|
1587
|
|
|
} elseif('A' <= $char && $char <= 'Z') { |
1588
|
|
|
/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z |
1589
|
|
|
Append the lowercase version of the input character |
1590
|
|
|
(add 0x0020 to the character's code point) to the current |
1591
|
|
|
DOCTYPE token's name. Stay in the DOCTYPE name state. */ |
1592
|
|
|
$this->token['name'] .= strtolower($char); |
1593
|
|
|
|
1594
|
|
|
} elseif($char === false) { |
1595
|
|
|
/* EOF |
1596
|
|
|
Parse error. Set the DOCTYPE token's force-quirks flag |
1597
|
|
|
to on. Emit that DOCTYPE token. Reconsume the EOF |
1598
|
|
|
character in the data state. */ |
1599
|
|
|
$this->emitToken(array( |
1600
|
|
|
'type' => self::PARSEERROR, |
1601
|
|
|
'data' => 'eof-in-doctype-name' |
1602
|
|
|
)); |
1603
|
|
|
$this->token['force-quirks'] = true; |
1604
|
|
|
$this->emitToken($this->token); |
1605
|
|
|
$this->stream->unget(); |
1606
|
|
|
$state = 'data'; |
1607
|
|
|
|
1608
|
|
|
} else { |
1609
|
|
|
/* Anything else |
1610
|
|
|
Append the current input character to the current |
1611
|
|
|
DOCTYPE token's name. Stay in the DOCTYPE name state. */ |
1612
|
|
|
$this->token['name'] .= $char; |
1613
|
|
|
} |
1614
|
|
|
|
1615
|
|
|
// XXX this is probably some sort of quirks mode designation, |
1616
|
|
|
// check tree-builder to be sure. In general 'error' needs |
1617
|
|
|
// to be specc'ified, this probably means removing it at the end |
1618
|
|
|
$this->token['error'] = ($this->token['name'] === 'HTML') |
1619
|
|
|
? false |
1620
|
|
|
: true; |
1621
|
|
|
break; |
1622
|
|
|
|
1623
|
|
|
case 'after DOCTYPE name': |
1624
|
|
|
/* Consume the next input character: */ |
1625
|
|
|
$char = $this->stream->char(); |
1626
|
|
|
|
1627
|
|
|
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
1628
|
|
|
/* U+0009 CHARACTER TABULATION |
1629
|
|
|
U+000A LINE FEED (LF) |
1630
|
|
|
U+000C FORM FEED (FF) |
1631
|
|
|
U+0020 SPACE |
1632
|
|
|
Stay in the after DOCTYPE name state. */ |
1633
|
|
|
|
1634
|
|
|
} elseif($char === '>') { |
1635
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
1636
|
|
|
Emit the current DOCTYPE token. Switch to the data state. */ |
1637
|
|
|
$this->emitToken($this->token); |
1638
|
|
|
$state = 'data'; |
1639
|
|
|
|
1640
|
|
|
} elseif($char === false) { |
1641
|
|
|
/* EOF |
1642
|
|
|
Parse error. Set the DOCTYPE token's force-quirks flag |
1643
|
|
|
to on. Emit that DOCTYPE token. Reconsume the EOF |
1644
|
|
|
character in the data state. */ |
1645
|
|
|
$this->emitToken(array( |
1646
|
|
|
'type' => self::PARSEERROR, |
1647
|
|
|
'data' => 'eof-in-doctype' |
1648
|
|
|
)); |
1649
|
|
|
$this->token['force-quirks'] = true; |
1650
|
|
|
$this->emitToken($this->token); |
1651
|
|
|
$this->stream->unget(); |
1652
|
|
|
$state = 'data'; |
1653
|
|
|
|
1654
|
|
|
} else { |
1655
|
|
|
/* Anything else */ |
1656
|
|
|
|
1657
|
|
|
$nextSix = strtoupper($char . $this->stream->charsWhile(self::ALPHA, 5)); |
1658
|
|
|
if ($nextSix === 'PUBLIC') { |
1659
|
|
|
/* If the next six characters are an ASCII |
1660
|
|
|
case-insensitive match for the word "PUBLIC", then |
1661
|
|
|
consume those characters and switch to the before |
1662
|
|
|
DOCTYPE public identifier state. */ |
1663
|
|
|
$state = 'before DOCTYPE public identifier'; |
1664
|
|
|
|
1665
|
|
|
} elseif ($nextSix === 'SYSTEM') { |
1666
|
|
|
/* Otherwise, if the next six characters are an ASCII |
1667
|
|
|
case-insensitive match for the word "SYSTEM", then |
1668
|
|
|
consume those characters and switch to the before |
1669
|
|
|
DOCTYPE system identifier state. */ |
1670
|
|
|
$state = 'before DOCTYPE system identifier'; |
1671
|
|
|
|
1672
|
|
|
} else { |
1673
|
|
|
/* Otherwise, this is the parse error. Set the DOCTYPE |
1674
|
|
|
token's force-quirks flag to on. Switch to the bogus |
1675
|
|
|
DOCTYPE state. */ |
1676
|
|
|
$this->emitToken(array( |
1677
|
|
|
'type' => self::PARSEERROR, |
1678
|
|
|
'data' => 'expected-space-or-right-bracket-in-doctype' |
1679
|
|
|
)); |
1680
|
|
|
$this->token['force-quirks'] = true; |
1681
|
|
|
$this->token['error'] = true; |
1682
|
|
|
$state = 'bogus DOCTYPE'; |
1683
|
|
|
} |
1684
|
|
|
} |
1685
|
|
|
break; |
1686
|
|
|
|
1687
|
|
|
case 'before DOCTYPE public identifier': |
1688
|
|
|
/* Consume the next input character: */ |
1689
|
|
|
$char = $this->stream->char(); |
1690
|
|
|
|
1691
|
|
|
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
1692
|
|
|
/* U+0009 CHARACTER TABULATION |
1693
|
|
|
U+000A LINE FEED (LF) |
1694
|
|
|
U+000C FORM FEED (FF) |
1695
|
|
|
U+0020 SPACE |
1696
|
|
|
Stay in the before DOCTYPE public identifier state. */ |
1697
|
|
|
} elseif ($char === '"') { |
1698
|
|
|
/* U+0022 QUOTATION MARK (") |
1699
|
|
|
Set the DOCTYPE token's public identifier to the empty |
1700
|
|
|
string (not missing), then switch to the DOCTYPE public |
1701
|
|
|
identifier (double-quoted) state. */ |
1702
|
|
|
$this->token['public'] = ''; |
1703
|
|
|
$state = 'DOCTYPE public identifier (double-quoted)'; |
1704
|
|
|
} elseif ($char === "'") { |
1705
|
|
|
/* U+0027 APOSTROPHE (') |
1706
|
|
|
Set the DOCTYPE token's public identifier to the empty |
1707
|
|
|
string (not missing), then switch to the DOCTYPE public |
1708
|
|
|
identifier (single-quoted) state. */ |
1709
|
|
|
$this->token['public'] = ''; |
1710
|
|
|
$state = 'DOCTYPE public identifier (single-quoted)'; |
1711
|
|
|
} elseif ($char === '>') { |
1712
|
|
|
/* Parse error. Set the DOCTYPE token's force-quirks flag |
1713
|
|
|
to on. Emit that DOCTYPE token. Switch to the data state. */ |
1714
|
|
|
$this->emitToken(array( |
1715
|
|
|
'type' => self::PARSEERROR, |
1716
|
|
|
'data' => 'unexpected-end-of-doctype' |
1717
|
|
|
)); |
1718
|
|
|
$this->token['force-quirks'] = true; |
1719
|
|
|
$this->emitToken($this->token); |
1720
|
|
|
$state = 'data'; |
1721
|
|
|
} elseif ($char === false) { |
1722
|
|
|
/* Parse error. Set the DOCTYPE token's force-quirks |
1723
|
|
|
flag to on. Emit that DOCTYPE token. Reconsume the EOF |
1724
|
|
|
character in the data state. */ |
1725
|
|
|
$this->emitToken(array( |
1726
|
|
|
'type' => self::PARSEERROR, |
1727
|
|
|
'data' => 'eof-in-doctype' |
1728
|
|
|
)); |
1729
|
|
|
$this->token['force-quirks'] = true; |
1730
|
|
|
$this->emitToken($this->token); |
1731
|
|
|
$this->stream->unget(); |
1732
|
|
|
$state = 'data'; |
1733
|
|
|
} else { |
1734
|
|
|
/* Parse error. Set the DOCTYPE token's force-quirks flag |
1735
|
|
|
to on. Switch to the bogus DOCTYPE state. */ |
1736
|
|
|
$this->emitToken(array( |
1737
|
|
|
'type' => self::PARSEERROR, |
1738
|
|
|
'data' => 'unexpected-char-in-doctype' |
1739
|
|
|
)); |
1740
|
|
|
$this->token['force-quirks'] = true; |
1741
|
|
|
$state = 'bogus DOCTYPE'; |
1742
|
|
|
} |
1743
|
|
|
break; |
1744
|
|
|
|
1745
|
|
|
case 'DOCTYPE public identifier (double-quoted)': |
1746
|
|
|
/* Consume the next input character: */ |
1747
|
|
|
$char = $this->stream->char(); |
1748
|
|
|
|
1749
|
|
|
if ($char === '"') { |
1750
|
|
|
/* U+0022 QUOTATION MARK (") |
1751
|
|
|
Switch to the after DOCTYPE public identifier state. */ |
1752
|
|
|
$state = 'after DOCTYPE public identifier'; |
1753
|
|
|
} elseif ($char === '>') { |
1754
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
1755
|
|
|
Parse error. Set the DOCTYPE token's force-quirks flag |
1756
|
|
|
to on. Emit that DOCTYPE token. Switch to the data state. */ |
1757
|
|
|
$this->emitToken(array( |
1758
|
|
|
'type' => self::PARSEERROR, |
1759
|
|
|
'data' => 'unexpected-end-of-doctype' |
1760
|
|
|
)); |
1761
|
|
|
$this->token['force-quirks'] = true; |
1762
|
|
|
$this->emitToken($this->token); |
1763
|
|
|
$state = 'data'; |
1764
|
|
|
} elseif ($char === false) { |
1765
|
|
|
/* EOF |
1766
|
|
|
Parse error. Set the DOCTYPE token's force-quirks flag |
1767
|
|
|
to on. Emit that DOCTYPE token. Reconsume the EOF |
1768
|
|
|
character in the data state. */ |
1769
|
|
|
$this->emitToken(array( |
1770
|
|
|
'type' => self::PARSEERROR, |
1771
|
|
|
'data' => 'eof-in-doctype' |
1772
|
|
|
)); |
1773
|
|
|
$this->token['force-quirks'] = true; |
1774
|
|
|
$this->emitToken($this->token); |
1775
|
|
|
$this->stream->unget(); |
1776
|
|
|
$state = 'data'; |
1777
|
|
|
} else { |
1778
|
|
|
/* Anything else |
1779
|
|
|
Append the current input character to the current |
1780
|
|
|
DOCTYPE token's public identifier. Stay in the DOCTYPE |
1781
|
|
|
public identifier (double-quoted) state. */ |
1782
|
|
|
$this->token['public'] .= $char; |
1783
|
|
|
} |
1784
|
|
|
break; |
1785
|
|
|
|
1786
|
|
|
case 'DOCTYPE public identifier (single-quoted)': |
1787
|
|
|
/* Consume the next input character: */ |
1788
|
|
|
$char = $this->stream->char(); |
1789
|
|
|
|
1790
|
|
|
if ($char === "'") { |
1791
|
|
|
/* U+0027 APOSTROPHE (') |
1792
|
|
|
Switch to the after DOCTYPE public identifier state. */ |
1793
|
|
|
$state = 'after DOCTYPE public identifier'; |
1794
|
|
|
} elseif ($char === '>') { |
1795
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
1796
|
|
|
Parse error. Set the DOCTYPE token's force-quirks flag |
1797
|
|
|
to on. Emit that DOCTYPE token. Switch to the data state. */ |
1798
|
|
|
$this->emitToken(array( |
1799
|
|
|
'type' => self::PARSEERROR, |
1800
|
|
|
'data' => 'unexpected-end-of-doctype' |
1801
|
|
|
)); |
1802
|
|
|
$this->token['force-quirks'] = true; |
1803
|
|
|
$this->emitToken($this->token); |
1804
|
|
|
$state = 'data'; |
1805
|
|
|
} elseif ($char === false) { |
1806
|
|
|
/* EOF |
1807
|
|
|
Parse error. Set the DOCTYPE token's force-quirks flag |
1808
|
|
|
to on. Emit that DOCTYPE token. Reconsume the EOF |
1809
|
|
|
character in the data state. */ |
1810
|
|
|
$this->emitToken(array( |
1811
|
|
|
'type' => self::PARSEERROR, |
1812
|
|
|
'data' => 'eof-in-doctype' |
1813
|
|
|
)); |
1814
|
|
|
$this->token['force-quirks'] = true; |
1815
|
|
|
$this->emitToken($this->token); |
1816
|
|
|
$this->stream->unget(); |
1817
|
|
|
$state = 'data'; |
1818
|
|
|
} else { |
1819
|
|
|
/* Anything else |
1820
|
|
|
Append the current input character to the current |
1821
|
|
|
DOCTYPE token's public identifier. Stay in the DOCTYPE |
1822
|
|
|
public identifier (single-quoted) state. */ |
1823
|
|
|
$this->token['public'] .= $char; |
1824
|
|
|
} |
1825
|
|
|
break; |
1826
|
|
|
|
1827
|
|
|
case 'after DOCTYPE public identifier': |
1828
|
|
|
/* Consume the next input character: */ |
1829
|
|
|
$char = $this->stream->char(); |
1830
|
|
|
|
1831
|
|
|
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
1832
|
|
|
/* U+0009 CHARACTER TABULATION |
1833
|
|
|
U+000A LINE FEED (LF) |
1834
|
|
|
U+000C FORM FEED (FF) |
1835
|
|
|
U+0020 SPACE |
1836
|
|
|
Stay in the after DOCTYPE public identifier state. */ |
1837
|
|
|
} elseif ($char === '"') { |
1838
|
|
|
/* U+0022 QUOTATION MARK (") |
1839
|
|
|
Set the DOCTYPE token's system identifier to the |
1840
|
|
|
empty string (not missing), then switch to the DOCTYPE |
1841
|
|
|
system identifier (double-quoted) state. */ |
1842
|
|
|
$this->token['system'] = ''; |
1843
|
|
|
$state = 'DOCTYPE system identifier (double-quoted)'; |
1844
|
|
|
} elseif ($char === "'") { |
1845
|
|
|
/* U+0027 APOSTROPHE (') |
1846
|
|
|
Set the DOCTYPE token's system identifier to the |
1847
|
|
|
empty string (not missing), then switch to the DOCTYPE |
1848
|
|
|
system identifier (single-quoted) state. */ |
1849
|
|
|
$this->token['system'] = ''; |
1850
|
|
|
$state = 'DOCTYPE system identifier (single-quoted)'; |
1851
|
|
|
} elseif ($char === '>') { |
1852
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
1853
|
|
|
Emit the current DOCTYPE token. Switch to the data state. */ |
1854
|
|
|
$this->emitToken($this->token); |
1855
|
|
|
$state = 'data'; |
1856
|
|
|
} elseif ($char === false) { |
1857
|
|
|
/* Parse error. Set the DOCTYPE token's force-quirks |
1858
|
|
|
flag to on. Emit that DOCTYPE token. Reconsume the EOF |
1859
|
|
|
character in the data state. */ |
1860
|
|
|
$this->emitToken(array( |
1861
|
|
|
'type' => self::PARSEERROR, |
1862
|
|
|
'data' => 'eof-in-doctype' |
1863
|
|
|
)); |
1864
|
|
|
$this->token['force-quirks'] = true; |
1865
|
|
|
$this->emitToken($this->token); |
1866
|
|
|
$this->stream->unget(); |
1867
|
|
|
$state = 'data'; |
1868
|
|
|
} else { |
1869
|
|
|
/* Anything else |
1870
|
|
|
Parse error. Set the DOCTYPE token's force-quirks flag |
1871
|
|
|
to on. Switch to the bogus DOCTYPE state. */ |
1872
|
|
|
$this->emitToken(array( |
1873
|
|
|
'type' => self::PARSEERROR, |
1874
|
|
|
'data' => 'unexpected-char-in-doctype' |
1875
|
|
|
)); |
1876
|
|
|
$this->token['force-quirks'] = true; |
1877
|
|
|
$state = 'bogus DOCTYPE'; |
1878
|
|
|
} |
1879
|
|
|
break; |
1880
|
|
|
|
1881
|
|
|
case 'before DOCTYPE system identifier': |
1882
|
|
|
/* Consume the next input character: */ |
1883
|
|
|
$char = $this->stream->char(); |
1884
|
|
|
|
1885
|
|
|
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
1886
|
|
|
/* U+0009 CHARACTER TABULATION |
1887
|
|
|
U+000A LINE FEED (LF) |
1888
|
|
|
U+000C FORM FEED (FF) |
1889
|
|
|
U+0020 SPACE |
1890
|
|
|
Stay in the before DOCTYPE system identifier state. */ |
1891
|
|
|
} elseif ($char === '"') { |
1892
|
|
|
/* U+0022 QUOTATION MARK (") |
1893
|
|
|
Set the DOCTYPE token's system identifier to the empty |
1894
|
|
|
string (not missing), then switch to the DOCTYPE system |
1895
|
|
|
identifier (double-quoted) state. */ |
1896
|
|
|
$this->token['system'] = ''; |
1897
|
|
|
$state = 'DOCTYPE system identifier (double-quoted)'; |
1898
|
|
|
} elseif ($char === "'") { |
1899
|
|
|
/* U+0027 APOSTROPHE (') |
1900
|
|
|
Set the DOCTYPE token's system identifier to the empty |
1901
|
|
|
string (not missing), then switch to the DOCTYPE system |
1902
|
|
|
identifier (single-quoted) state. */ |
1903
|
|
|
$this->token['system'] = ''; |
1904
|
|
|
$state = 'DOCTYPE system identifier (single-quoted)'; |
1905
|
|
|
} elseif ($char === '>') { |
1906
|
|
|
/* Parse error. Set the DOCTYPE token's force-quirks flag |
1907
|
|
|
to on. Emit that DOCTYPE token. Switch to the data state. */ |
1908
|
|
|
$this->emitToken(array( |
1909
|
|
|
'type' => self::PARSEERROR, |
1910
|
|
|
'data' => 'unexpected-char-in-doctype' |
1911
|
|
|
)); |
1912
|
|
|
$this->token['force-quirks'] = true; |
1913
|
|
|
$this->emitToken($this->token); |
1914
|
|
|
$state = 'data'; |
1915
|
|
|
} elseif ($char === false) { |
1916
|
|
|
/* Parse error. Set the DOCTYPE token's force-quirks |
1917
|
|
|
flag to on. Emit that DOCTYPE token. Reconsume the EOF |
1918
|
|
|
character in the data state. */ |
1919
|
|
|
$this->emitToken(array( |
1920
|
|
|
'type' => self::PARSEERROR, |
1921
|
|
|
'data' => 'eof-in-doctype' |
1922
|
|
|
)); |
1923
|
|
|
$this->token['force-quirks'] = true; |
1924
|
|
|
$this->emitToken($this->token); |
1925
|
|
|
$this->stream->unget(); |
1926
|
|
|
$state = 'data'; |
1927
|
|
|
} else { |
1928
|
|
|
/* Parse error. Set the DOCTYPE token's force-quirks flag |
1929
|
|
|
to on. Switch to the bogus DOCTYPE state. */ |
1930
|
|
|
$this->emitToken(array( |
1931
|
|
|
'type' => self::PARSEERROR, |
1932
|
|
|
'data' => 'unexpected-char-in-doctype' |
1933
|
|
|
)); |
1934
|
|
|
$this->token['force-quirks'] = true; |
1935
|
|
|
$state = 'bogus DOCTYPE'; |
1936
|
|
|
} |
1937
|
|
|
break; |
1938
|
|
|
|
1939
|
|
|
case 'DOCTYPE system identifier (double-quoted)': |
1940
|
|
|
/* Consume the next input character: */ |
1941
|
|
|
$char = $this->stream->char(); |
1942
|
|
|
|
1943
|
|
|
if ($char === '"') { |
1944
|
|
|
/* U+0022 QUOTATION MARK (") |
1945
|
|
|
Switch to the after DOCTYPE system identifier state. */ |
1946
|
|
|
$state = 'after DOCTYPE system identifier'; |
1947
|
|
|
} elseif ($char === '>') { |
1948
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
1949
|
|
|
Parse error. Set the DOCTYPE token's force-quirks flag |
1950
|
|
|
to on. Emit that DOCTYPE token. Switch to the data state. */ |
1951
|
|
|
$this->emitToken(array( |
1952
|
|
|
'type' => self::PARSEERROR, |
1953
|
|
|
'data' => 'unexpected-end-of-doctype' |
1954
|
|
|
)); |
1955
|
|
|
$this->token['force-quirks'] = true; |
1956
|
|
|
$this->emitToken($this->token); |
1957
|
|
|
$state = 'data'; |
1958
|
|
|
} elseif ($char === false) { |
1959
|
|
|
/* EOF |
1960
|
|
|
Parse error. Set the DOCTYPE token's force-quirks flag |
1961
|
|
|
to on. Emit that DOCTYPE token. Reconsume the EOF |
1962
|
|
|
character in the data state. */ |
1963
|
|
|
$this->emitToken(array( |
1964
|
|
|
'type' => self::PARSEERROR, |
1965
|
|
|
'data' => 'eof-in-doctype' |
1966
|
|
|
)); |
1967
|
|
|
$this->token['force-quirks'] = true; |
1968
|
|
|
$this->emitToken($this->token); |
1969
|
|
|
$this->stream->unget(); |
1970
|
|
|
$state = 'data'; |
1971
|
|
|
} else { |
1972
|
|
|
/* Anything else |
1973
|
|
|
Append the current input character to the current |
1974
|
|
|
DOCTYPE token's system identifier. Stay in the DOCTYPE |
1975
|
|
|
system identifier (double-quoted) state. */ |
1976
|
|
|
$this->token['system'] .= $char; |
1977
|
|
|
} |
1978
|
|
|
break; |
1979
|
|
|
|
1980
|
|
|
case 'DOCTYPE system identifier (single-quoted)': |
1981
|
|
|
/* Consume the next input character: */ |
1982
|
|
|
$char = $this->stream->char(); |
1983
|
|
|
|
1984
|
|
|
if ($char === "'") { |
1985
|
|
|
/* U+0027 APOSTROPHE (') |
1986
|
|
|
Switch to the after DOCTYPE system identifier state. */ |
1987
|
|
|
$state = 'after DOCTYPE system identifier'; |
1988
|
|
|
} elseif ($char === '>') { |
1989
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
1990
|
|
|
Parse error. Set the DOCTYPE token's force-quirks flag |
1991
|
|
|
to on. Emit that DOCTYPE token. Switch to the data state. */ |
1992
|
|
|
$this->emitToken(array( |
1993
|
|
|
'type' => self::PARSEERROR, |
1994
|
|
|
'data' => 'unexpected-end-of-doctype' |
1995
|
|
|
)); |
1996
|
|
|
$this->token['force-quirks'] = true; |
1997
|
|
|
$this->emitToken($this->token); |
1998
|
|
|
$state = 'data'; |
1999
|
|
|
} elseif ($char === false) { |
2000
|
|
|
/* EOF |
2001
|
|
|
Parse error. Set the DOCTYPE token's force-quirks flag |
2002
|
|
|
to on. Emit that DOCTYPE token. Reconsume the EOF |
2003
|
|
|
character in the data state. */ |
2004
|
|
|
$this->emitToken(array( |
2005
|
|
|
'type' => self::PARSEERROR, |
2006
|
|
|
'data' => 'eof-in-doctype' |
2007
|
|
|
)); |
2008
|
|
|
$this->token['force-quirks'] = true; |
2009
|
|
|
$this->emitToken($this->token); |
2010
|
|
|
$this->stream->unget(); |
2011
|
|
|
$state = 'data'; |
2012
|
|
|
} else { |
2013
|
|
|
/* Anything else |
2014
|
|
|
Append the current input character to the current |
2015
|
|
|
DOCTYPE token's system identifier. Stay in the DOCTYPE |
2016
|
|
|
system identifier (single-quoted) state. */ |
2017
|
|
|
$this->token['system'] .= $char; |
2018
|
|
|
} |
2019
|
|
|
break; |
2020
|
|
|
|
2021
|
|
|
case 'after DOCTYPE system identifier': |
2022
|
|
|
/* Consume the next input character: */ |
2023
|
|
|
$char = $this->stream->char(); |
2024
|
|
|
|
2025
|
|
|
if($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') { |
2026
|
|
|
/* U+0009 CHARACTER TABULATION |
2027
|
|
|
U+000A LINE FEED (LF) |
2028
|
|
|
U+000C FORM FEED (FF) |
2029
|
|
|
U+0020 SPACE |
2030
|
|
|
Stay in the after DOCTYPE system identifier state. */ |
2031
|
|
|
} elseif ($char === '>') { |
2032
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
2033
|
|
|
Emit the current DOCTYPE token. Switch to the data state. */ |
2034
|
|
|
$this->emitToken($this->token); |
2035
|
|
|
$state = 'data'; |
2036
|
|
|
} elseif ($char === false) { |
2037
|
|
|
/* Parse error. Set the DOCTYPE token's force-quirks |
2038
|
|
|
flag to on. Emit that DOCTYPE token. Reconsume the EOF |
2039
|
|
|
character in the data state. */ |
2040
|
|
|
$this->emitToken(array( |
2041
|
|
|
'type' => self::PARSEERROR, |
2042
|
|
|
'data' => 'eof-in-doctype' |
2043
|
|
|
)); |
2044
|
|
|
$this->token['force-quirks'] = true; |
2045
|
|
|
$this->emitToken($this->token); |
2046
|
|
|
$this->stream->unget(); |
2047
|
|
|
$state = 'data'; |
2048
|
|
|
} else { |
2049
|
|
|
/* Anything else |
2050
|
|
|
Parse error. Switch to the bogus DOCTYPE state. |
2051
|
|
|
(This does not set the DOCTYPE token's force-quirks |
2052
|
|
|
flag to on.) */ |
2053
|
|
|
$this->emitToken(array( |
2054
|
|
|
'type' => self::PARSEERROR, |
2055
|
|
|
'data' => 'unexpected-char-in-doctype' |
2056
|
|
|
)); |
2057
|
|
|
$state = 'bogus DOCTYPE'; |
2058
|
|
|
} |
2059
|
|
|
break; |
2060
|
|
|
|
2061
|
|
|
case 'bogus DOCTYPE': |
2062
|
|
|
/* Consume the next input character: */ |
2063
|
|
|
$char = $this->stream->char(); |
2064
|
|
|
|
2065
|
|
|
if ($char === '>') { |
2066
|
|
|
/* U+003E GREATER-THAN SIGN (>) |
2067
|
|
|
Emit the DOCTYPE token. Switch to the data state. */ |
2068
|
|
|
$this->emitToken($this->token); |
2069
|
|
|
$state = 'data'; |
2070
|
|
|
|
2071
|
|
|
} elseif($char === false) { |
2072
|
|
|
/* EOF |
2073
|
|
|
Emit the DOCTYPE token. Reconsume the EOF character in |
2074
|
|
|
the data state. */ |
2075
|
|
|
$this->emitToken($this->token); |
2076
|
|
|
$this->stream->unget(); |
2077
|
|
|
$state = 'data'; |
2078
|
|
|
|
2079
|
|
|
} else { |
2080
|
|
|
/* Anything else |
2081
|
|
|
Stay in the bogus DOCTYPE state. */ |
2082
|
|
|
} |
2083
|
|
|
break; |
2084
|
|
|
|
2085
|
|
|
// case 'cdataSection': |
2086
|
|
|
|
2087
|
|
|
} |
2088
|
|
|
} |
2089
|
|
|
} |
2090
|
|
|
|
2091
|
|
|
/**
 * Serializes the tree produced so far.
 *
 * Pure delegation: asks the tree builder held in $this->tree for its
 * serialized form and hands the result back unchanged.
 *
 * @return mixed Whatever the tree builder's save() returns.
 */
public function save() {
    $serialized = $this->tree->save();

    return $serialized;
}
2097
|
|
|
|
2098
|
|
|
/**
 * Accessor for the input stream this tokenizer reads from.
 *
 * @return mixed The object stored in $this->stream (an InputStream).
 */
public function stream() {
    $input = $this->stream;

    return $input;
}
2104
|
|
|
|
2105
|
|
|
/**
 * Attempts to consume a character reference from the input stream.
 *
 * The first character read here is treated as the one immediately after
 * the U+0026 AMPERSAND, i.e. the ampersand itself is presumably consumed
 * by the caller before this method runs.
 *
 * Parse errors are reported by emitting PARSEERROR tokens via emitToken().
 *
 * @param string|false $allowed The additional allowed character, or false
 *                              when there is none.
 * @param bool         $inattr  True when the reference is being consumed as
 *                              part of an attribute value.
 *
 * @return string The decoded character (possibly followed by extra
 *                characters that were read but are not part of the
 *                reference), or '&' followed by the characters that were
 *                consumed when no reference could be decoded.
 */
private function consumeCharacterReference($allowed = false, $inattr = false) {
    // This goes quite far against spec, and is far closer to the Python
    // impl., mainly because we don't do the large unconsuming the spec
    // requires.

    // All consumed characters.
    $chars = $this->stream->char();

    /* This section defines how to consume a character
    reference. This definition is used when parsing character
    references in text and in attributes.

    The behavior depends on the identity of the next character
    (the one immediately after the U+0026 AMPERSAND character): */

    if (
        // EOF has to be tested first: char() yields false at end of input,
        // and reading offset 0 of false raises a notice/warning on
        // PHP 7.4 and later.
        $chars === false ||
        $chars[0] === "\x09" ||
        $chars[0] === "\x0A" ||
        $chars[0] === "\x0C" ||
        $chars[0] === "\x20" ||
        $chars[0] === '<' ||
        $chars[0] === '&' ||
        $chars[0] === $allowed
    ) {
        /* U+0009 CHARACTER TABULATION
           U+000A LINE FEED (LF)
           U+000C FORM FEED (FF)
           U+0020 SPACE
           U+003C LESS-THAN SIGN
           U+0026 AMPERSAND
           EOF
           The additional allowed character, if there is one
        Not a character reference. No characters are consumed,
        and nothing is returned. (This is not an error, either.) */
        // We already consumed, so unconsume.
        $this->stream->unget();
        return '&';
    } elseif ($chars[0] === '#') {
        /* Consume the U+0023 NUMBER SIGN. */
        // Um, yeah, we already did that.
        /* The behavior further depends on the character after
        the U+0023 NUMBER SIGN: */
        $chars .= $this->stream->char();
        if (isset($chars[1]) && ($chars[1] === 'x' || $chars[1] === 'X')) {
            /* U+0078 LATIN SMALL LETTER X
               U+0058 LATIN CAPITAL LETTER X */
            /* Consume the X. */
            // Um, yeah, we already did that.
            /* Follow the steps below, but using the range of
            characters U+0030 DIGIT ZERO through to U+0039 DIGIT
            NINE, U+0061 LATIN SMALL LETTER A through to U+0066
            LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
            A, through to U+0046 LATIN CAPITAL LETTER F (in other
            words, 0123456789, ABCDEF, abcdef). */
            $char_class = self::HEX;
            /* When it comes to interpreting the
            number, interpret it as a hexadecimal number. */
            $hex = true;
        } else {
            /* Anything else */
            // Unconsume because we shouldn't have consumed this.
            $chars = $chars[0];
            $this->stream->unget();
            /* Follow the steps below, but using the range of
            characters U+0030 DIGIT ZERO through to U+0039 DIGIT
            NINE (i.e. just 0123456789). */
            $char_class = self::DIGIT;
            /* When it comes to interpreting the number,
            interpret it as a decimal number. */
            $hex = false;
        }

        /* Consume as many characters as match the range of characters given above. */
        $consumed = $this->stream->charsWhile($char_class);
        if ($consumed === '' || $consumed === false) {
            /* If no characters match the range, then don't consume
            any characters (and unconsume the U+0023 NUMBER SIGN
            character and, if appropriate, the X character). This
            is a parse error; nothing is returned. */
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'expected-numeric-entity'
            ));
            // Rather than unconsuming, hand the '#'/'#x' back as literal text.
            return '&' . $chars;
        } else {
            /* Otherwise, if the next character is a U+003B SEMICOLON,
            consume that too. If it isn't, there is a parse error. */
            if ($this->stream->char() !== ';') {
                $this->stream->unget();
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'numeric-entity-without-semicolon'
                ));
            }

            /* If one or more characters match the range, then take
            them all and interpret the string of characters as a number
            (either hexadecimal or decimal as appropriate). */
            $codepoint = $hex ? hexdec($consumed) : (int) $consumed;

            /* If that number is one of the numbers in the first column
            of the following table, then this is a parse error. Find the
            row with that number in the first column, and return a
            character token for the Unicode character given in the
            second column of that row. */
            $new_codepoint = HTML5_Data::getRealCodepoint($codepoint);
            if ($new_codepoint) {
                $this->emitToken(array(
                    'type' => self::PARSEERROR,
                    'data' => 'illegal-windows-1252-entity'
                ));
                return HTML5_Data::utf8chr($new_codepoint);
            } else {
                /* Otherwise, if the number is greater than 0x10FFFF, then
                 * this is a parse error. Return a U+FFFD REPLACEMENT
                 * CHARACTER. */
                if ($codepoint > 0x10FFFF) {
                    $this->emitToken(array(
                        'type' => self::PARSEERROR,
                        'data' => 'overlong-character-entity' // XXX probably not correct
                    ));
                    // UTF-8 encoding of U+FFFD.
                    return "\xEF\xBF\xBD";
                }
                /* Otherwise, return a character token for the Unicode
                 * character whose code point is that number.  If the
                 * number is in the range 0x0001 to 0x0008, 0x000E to
                 * 0x001F, 0x007F to 0x009F, 0xD800 to 0xDFFF, 0xFDD0 to
                 * 0xFDEF, or is one of 0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
                 * 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE,
                 * 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
                 * 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE,
                 * 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                 * 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE,
                 * or 0x10FFFF, then this is a parse error. */
                // && has higher precedence than ||
                if (
                    $codepoint >= 0x0000 && $codepoint <= 0x0008 ||
                    $codepoint === 0x000B ||
                    $codepoint >= 0x000E && $codepoint <= 0x001F ||
                    $codepoint >= 0x007F && $codepoint <= 0x009F ||
                    $codepoint >= 0xD800 && $codepoint <= 0xDFFF ||
                    $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF ||
                    ($codepoint & 0xFFFE) === 0xFFFE ||
                    $codepoint == 0x10FFFF || $codepoint == 0x10FFFE
                ) {
                    $this->emitToken(array(
                        'type' => self::PARSEERROR,
                        'data' => 'illegal-codepoint-for-numeric-entity'
                    ));
                }
                return HTML5_Data::utf8chr($codepoint);
            }
        }

    } else {
        /* Anything else */

        /* Consume the maximum number of characters possible,
        with the consumed characters matching one of the
        identifiers in the first column of the named character
        references table (in a case-sensitive manner). */
        // What we actually do here is consume as much as we can while it
        // matches the start of one of the identifiers in the first column.

        $refs = HTML5_Data::getNamedCharacterReferences();

        // Get the longest string which is the start of an identifier
        // ($chars) as well as the longest identifier which matches ($id)
        // and its codepoint ($codepoint).
        $codepoint = false;
        $char = $chars;
        // Walk $refs one character at a time; each step descends into the
        // sub-array keyed by the character just read.
        while ($char !== false && isset($refs[$char])) {
            $refs = $refs[$char];
            if (isset($refs['codepoint'])) {
                $id = $chars;
                $codepoint = $refs['codepoint'];
            }
            $chars .= $char = $this->stream->char();
        }

        // Unconsume the one character we just took which caused the while
        // statement to fail. This could be anything and could cause state
        // changes (as if it matches the while loop it must be
        // alphanumeric so we can just concat it to whatever we get later).
        $this->stream->unget();
        if ($char !== false) {
            $chars = substr($chars, 0, -1);
        }

        /* If no match can be made, then this is a parse error.
        No characters are consumed, and nothing is returned. */
        if (!$codepoint) {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'expected-named-entity'
            ));
            return '&' . $chars;
        }

        /* If the last character matched is not a U+003B SEMICOLON
        (;), there is a parse error. */
        $semicolon = true;
        if (substr($id, -1) !== ';') {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'named-entity-without-semicolon'
            ));
            $semicolon = false;
        }

        /* If the character reference is being consumed as part of
        an attribute, and the last character matched is not a
        U+003B SEMICOLON (;), and the next character is in the
        range U+0030 DIGIT ZERO to U+0039 DIGIT NINE, U+0041
        LATIN CAPITAL LETTER A to U+005A LATIN CAPITAL LETTER Z,
        or U+0061 LATIN SMALL LETTER A to U+007A LATIN SMALL LETTER Z,
        then, for historical reasons, all the characters that were
        matched after the U+0026 AMPERSAND (&) must be unconsumed,
        and nothing is returned. */
        if ($inattr && !$semicolon) {
            // The next character is either the next character in $chars or in the stream.
            if (strlen($chars) > strlen($id)) {
                $next = substr($chars, strlen($id), 1);
            } else {
                $next = $this->stream->char();
                $this->stream->unget();
            }
            // $next is false at EOF. Comparing a string with a bool
            // converts both sides to bool, so without the explicit guard
            // the range checks below would all pass for false and EOF
            // would wrongly be treated as an alphanumeric character.
            if (
                $next !== false && (
                    '0' <= $next && $next <= '9' ||
                    'A' <= $next && $next <= 'Z' ||
                    'a' <= $next && $next <= 'z'
                )
            ) {
                return '&' . $chars;
            }
        }

        /* Otherwise, return a character token for the character
        corresponding to the character reference name (as given
        by the second column of the named character references table). */
        // Characters read past the matched identifier are appended as-is.
        return HTML5_Data::utf8chr($codepoint) . substr($chars, strlen($id));
    }
}
2348
|
|
|
|
2349
|
|
|
/**
 * Handles a character reference encountered inside an attribute value.
 *
 * Calls consumeCharacterReference() with its second argument set to true
 * and appends the outcome to the 'value' of the last entry in
 * $this->token['attr'].
 *
 * @param mixed $allowed Forwarded unchanged as the first argument of
 *                       consumeCharacterReference(); defaults to false.
 * @return void
 */
private function characterReferenceInAttributeValue($allowed = false) {
    /* Attempt to consume a character reference. */
    $entity = $this->consumeCharacterReference($allowed, true);

    /* If nothing is returned, append a U+0026 AMPERSAND
    character to the current attribute's value.

    Otherwise, append the returned character token to the
    current attribute's value. */
    // A plain truthiness test (!$entity) is not safe here: PHP treats the
    // string "0" as falsy, so a reference that decodes to "0" would be
    // replaced by "&". Only false, null and the empty string are taken to
    // mean "nothing was returned".
    $nothingReturned = ($entity === false || $entity === null || $entity === '');
    $char = $nothingReturned ? '&' : $entity;

    // The current attribute is the last one pushed onto the token.
    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;

    /* Finally, switch back to the attribute value state that you
    were in when you were switched into this state. */
    // No state change is made in this method body.
}
2368
|
|
|
|
2369
|
|
|
/**
 * Emits a token, passing it on to the tree builder.
 *
 * Before the token itself is forwarded, parse-error tokens are emitted
 * for: errors queued on the input stream, attributes on an end tag (one
 * error per attribute), a self-closing flag on an end tag, and duplicate
 * attribute names on a start tag.
 *
 * @param array $token       Token to emit; its 'type' key is always read,
 *                           'attr' and 'self-closing' depending on the type.
 * @param bool  $checkStream When true, errors queued on $this->stream are
 *                           emitted first.
 * @param bool  $dry         When true, the token is not handed to the
 *                           tree builder.
 */
protected function emitToken($token, $checkStream = true, $dry = false) {
    if ($checkStream) {
        // Drain the errors queued by the input stream; each one is emitted
        // with the stream check switched off.
        while ($this->stream->errors) {
            $pending = array_shift($this->stream->errors);
            $this->emitToken($pending, false);
        }
    }

    $isEndTag = ($token['type'] === self::ENDTAG);

    // End tags must not carry attributes: one parse error per attribute.
    if ($isEndTag && !empty($token['attr'])) {
        $attrCount = count($token['attr']);
        for ($n = 0; $n < $attrCount; $n++) {
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'attributes-in-end-tag'
            ));
        }
    }

    // End tags must not be self-closing either.
    if ($isEndTag && !empty($token['self-closing'])) {
        $this->emitToken(array(
            'type' => self::PARSEERROR,
            'data' => 'self-closing-flag-on-end-tag',
        ));
    }

    if ($token['type'] === self::STARTTAG) {
        // Track the attribute names seen so far; a repeated name is a
        // parse error. (This could be changed to actually pass the
        // tree-builder a hash.)
        $seen = array();
        foreach ($token['attr'] as $attribute) {
            $attrName = $attribute['name'];
            if (!isset($seen[$attrName])) {
                $seen[$attrName] = $attribute['value'];
                continue;
            }
            $this->emitToken(array(
                'type' => self::PARSEERROR,
                'data' => 'duplicate-attribute',
            ));
        }
    }

    // The tree builder is only consulted when this is not a dry run.
    $treeSetModel = false;
    if (!$dry) {
        // the current structure of attributes is not a terribly good one
        $this->tree->emitToken($token);
        $treeSetModel = is_int($this->tree->content_model);
    }

    if ($treeSetModel) {
        // Adopt the content model set on the tree builder, then clear it.
        $this->content_model = $this->tree->content_model;
        $this->tree->content_model = null;
    } elseif ($isEndTag) {
        $this->content_model = self::PCDATA;
    }
}
2421
|
|
|
} |
2422
|
|
|
|
2423
|
|
|
|