1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/**
 * Experimental lexer that parses markup with Jeroen van der Meer's PH5P
 * (HTML5-style) parser and converts the resulting DOM into tokens.
 *
 * The bundled parser occupies the "HTML5" pseudo-namespace, which may
 * conflict with other code defining classes of the same name.
 *
 * @note
 *    Recent changes to PHP's DOM extension have resulted in some fatal
 *    error conditions with the original version of PH5P. Pending changes,
 *    this lexer falls back to DirectLex whenever a DOMException is thrown.
 */
class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
{
    /**
     * Tokenizes HTML through PH5P, punting to DirectLex on a DOMException.
     *
     * @param string $html
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return HTMLPurifier_Token[]
     */
    public function tokenizeHTML($html, $config, $context)
    {
        $normalized = $this->normalize($html, $config, $context);
        $wrapped = $this->wrapHTML($normalized, $config, $context, false /* no div */);

        try {
            $parser = new HTML5($wrapped);
            $document = $parser->save();
        } catch (DOMException $error) {
            // PH5P failed: register the exception under 'PH5PError' so the
            // fallback can be detected, then lex the ORIGINAL html instead.
            $fallback = new HTMLPurifier_Lexer_DirectLex();
            $context->register('PH5PError', $error);
            return $fallback->tokenizeHTML($html, $config, $context);
        }

        // Only the subtree under the first <html> -> first <body> is tokenized.
        $htmlElement = $document->getElementsByTagName('html')->item(0);
        $bodyElement = $htmlElement->getElementsByTagName('body')->item(0);

        $tokens = array();
        $this->tokenizeDOM($bodyElement, $tokens, $config);
        return $tokens;
    }
}
44
|
|
|
|
45
|
|
|
/* |
46
|
|
|
|
47
|
|
|
Copyright 2007 Jeroen van der Meer <http://jero.net/> |
48
|
|
|
|
49
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a |
50
|
|
|
copy of this software and associated documentation files (the |
51
|
|
|
"Software"), to deal in the Software without restriction, including |
52
|
|
|
without limitation the rights to use, copy, modify, merge, publish, |
53
|
|
|
distribute, sublicense, and/or sell copies of the Software, and to |
54
|
|
|
permit persons to whom the Software is furnished to do so, subject to |
55
|
|
|
the following conditions: |
56
|
|
|
|
57
|
|
|
The above copyright notice and this permission notice shall be included |
58
|
|
|
in all copies or substantial portions of the Software. |
59
|
|
|
|
60
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
61
|
|
|
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
62
|
|
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
63
|
|
|
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
64
|
|
|
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
65
|
|
|
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
66
|
|
|
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
67
|
|
|
|
68
|
|
|
*/ |
69
|
|
|
|
70
|
|
|
class HTML5 |
71
|
|
|
{ |
72
|
|
|
private $data; |
73
|
|
|
private $char; |
74
|
|
|
private $EOF; |
75
|
|
|
private $state; |
76
|
|
|
private $tree; |
77
|
|
|
private $token; |
78
|
|
|
private $content_model; |
79
|
|
|
private $escape = false; |
80
|
|
|
private $entities = array( |
81
|
|
|
'AElig;', |
82
|
|
|
'AElig', |
83
|
|
|
'AMP;', |
84
|
|
|
'AMP', |
85
|
|
|
'Aacute;', |
86
|
|
|
'Aacute', |
87
|
|
|
'Acirc;', |
88
|
|
|
'Acirc', |
89
|
|
|
'Agrave;', |
90
|
|
|
'Agrave', |
91
|
|
|
'Alpha;', |
92
|
|
|
'Aring;', |
93
|
|
|
'Aring', |
94
|
|
|
'Atilde;', |
95
|
|
|
'Atilde', |
96
|
|
|
'Auml;', |
97
|
|
|
'Auml', |
98
|
|
|
'Beta;', |
99
|
|
|
'COPY;', |
100
|
|
|
'COPY', |
101
|
|
|
'Ccedil;', |
102
|
|
|
'Ccedil', |
103
|
|
|
'Chi;', |
104
|
|
|
'Dagger;', |
105
|
|
|
'Delta;', |
106
|
|
|
'ETH;', |
107
|
|
|
'ETH', |
108
|
|
|
'Eacute;', |
109
|
|
|
'Eacute', |
110
|
|
|
'Ecirc;', |
111
|
|
|
'Ecirc', |
112
|
|
|
'Egrave;', |
113
|
|
|
'Egrave', |
114
|
|
|
'Epsilon;', |
115
|
|
|
'Eta;', |
116
|
|
|
'Euml;', |
117
|
|
|
'Euml', |
118
|
|
|
'GT;', |
119
|
|
|
'GT', |
120
|
|
|
'Gamma;', |
121
|
|
|
'Iacute;', |
122
|
|
|
'Iacute', |
123
|
|
|
'Icirc;', |
124
|
|
|
'Icirc', |
125
|
|
|
'Igrave;', |
126
|
|
|
'Igrave', |
127
|
|
|
'Iota;', |
128
|
|
|
'Iuml;', |
129
|
|
|
'Iuml', |
130
|
|
|
'Kappa;', |
131
|
|
|
'LT;', |
132
|
|
|
'LT', |
133
|
|
|
'Lambda;', |
134
|
|
|
'Mu;', |
135
|
|
|
'Ntilde;', |
136
|
|
|
'Ntilde', |
137
|
|
|
'Nu;', |
138
|
|
|
'OElig;', |
139
|
|
|
'Oacute;', |
140
|
|
|
'Oacute', |
141
|
|
|
'Ocirc;', |
142
|
|
|
'Ocirc', |
143
|
|
|
'Ograve;', |
144
|
|
|
'Ograve', |
145
|
|
|
'Omega;', |
146
|
|
|
'Omicron;', |
147
|
|
|
'Oslash;', |
148
|
|
|
'Oslash', |
149
|
|
|
'Otilde;', |
150
|
|
|
'Otilde', |
151
|
|
|
'Ouml;', |
152
|
|
|
'Ouml', |
153
|
|
|
'Phi;', |
154
|
|
|
'Pi;', |
155
|
|
|
'Prime;', |
156
|
|
|
'Psi;', |
157
|
|
|
'QUOT;', |
158
|
|
|
'QUOT', |
159
|
|
|
'REG;', |
160
|
|
|
'REG', |
161
|
|
|
'Rho;', |
162
|
|
|
'Scaron;', |
163
|
|
|
'Sigma;', |
164
|
|
|
'THORN;', |
165
|
|
|
'THORN', |
166
|
|
|
'TRADE;', |
167
|
|
|
'Tau;', |
168
|
|
|
'Theta;', |
169
|
|
|
'Uacute;', |
170
|
|
|
'Uacute', |
171
|
|
|
'Ucirc;', |
172
|
|
|
'Ucirc', |
173
|
|
|
'Ugrave;', |
174
|
|
|
'Ugrave', |
175
|
|
|
'Upsilon;', |
176
|
|
|
'Uuml;', |
177
|
|
|
'Uuml', |
178
|
|
|
'Xi;', |
179
|
|
|
'Yacute;', |
180
|
|
|
'Yacute', |
181
|
|
|
'Yuml;', |
182
|
|
|
'Zeta;', |
183
|
|
|
'aacute;', |
184
|
|
|
'aacute', |
185
|
|
|
'acirc;', |
186
|
|
|
'acirc', |
187
|
|
|
'acute;', |
188
|
|
|
'acute', |
189
|
|
|
'aelig;', |
190
|
|
|
'aelig', |
191
|
|
|
'agrave;', |
192
|
|
|
'agrave', |
193
|
|
|
'alefsym;', |
194
|
|
|
'alpha;', |
195
|
|
|
'amp;', |
196
|
|
|
'amp', |
197
|
|
|
'and;', |
198
|
|
|
'ang;', |
199
|
|
|
'apos;', |
200
|
|
|
'aring;', |
201
|
|
|
'aring', |
202
|
|
|
'asymp;', |
203
|
|
|
'atilde;', |
204
|
|
|
'atilde', |
205
|
|
|
'auml;', |
206
|
|
|
'auml', |
207
|
|
|
'bdquo;', |
208
|
|
|
'beta;', |
209
|
|
|
'brvbar;', |
210
|
|
|
'brvbar', |
211
|
|
|
'bull;', |
212
|
|
|
'cap;', |
213
|
|
|
'ccedil;', |
214
|
|
|
'ccedil', |
215
|
|
|
'cedil;', |
216
|
|
|
'cedil', |
217
|
|
|
'cent;', |
218
|
|
|
'cent', |
219
|
|
|
'chi;', |
220
|
|
|
'circ;', |
221
|
|
|
'clubs;', |
222
|
|
|
'cong;', |
223
|
|
|
'copy;', |
224
|
|
|
'copy', |
225
|
|
|
'crarr;', |
226
|
|
|
'cup;', |
227
|
|
|
'curren;', |
228
|
|
|
'curren', |
229
|
|
|
'dArr;', |
230
|
|
|
'dagger;', |
231
|
|
|
'darr;', |
232
|
|
|
'deg;', |
233
|
|
|
'deg', |
234
|
|
|
'delta;', |
235
|
|
|
'diams;', |
236
|
|
|
'divide;', |
237
|
|
|
'divide', |
238
|
|
|
'eacute;', |
239
|
|
|
'eacute', |
240
|
|
|
'ecirc;', |
241
|
|
|
'ecirc', |
242
|
|
|
'egrave;', |
243
|
|
|
'egrave', |
244
|
|
|
'empty;', |
245
|
|
|
'emsp;', |
246
|
|
|
'ensp;', |
247
|
|
|
'epsilon;', |
248
|
|
|
'equiv;', |
249
|
|
|
'eta;', |
250
|
|
|
'eth;', |
251
|
|
|
'eth', |
252
|
|
|
'euml;', |
253
|
|
|
'euml', |
254
|
|
|
'euro;', |
255
|
|
|
'exist;', |
256
|
|
|
'fnof;', |
257
|
|
|
'forall;', |
258
|
|
|
'frac12;', |
259
|
|
|
'frac12', |
260
|
|
|
'frac14;', |
261
|
|
|
'frac14', |
262
|
|
|
'frac34;', |
263
|
|
|
'frac34', |
264
|
|
|
'frasl;', |
265
|
|
|
'gamma;', |
266
|
|
|
'ge;', |
267
|
|
|
'gt;', |
268
|
|
|
'gt', |
269
|
|
|
'hArr;', |
270
|
|
|
'harr;', |
271
|
|
|
'hearts;', |
272
|
|
|
'hellip;', |
273
|
|
|
'iacute;', |
274
|
|
|
'iacute', |
275
|
|
|
'icirc;', |
276
|
|
|
'icirc', |
277
|
|
|
'iexcl;', |
278
|
|
|
'iexcl', |
279
|
|
|
'igrave;', |
280
|
|
|
'igrave', |
281
|
|
|
'image;', |
282
|
|
|
'infin;', |
283
|
|
|
'int;', |
284
|
|
|
'iota;', |
285
|
|
|
'iquest;', |
286
|
|
|
'iquest', |
287
|
|
|
'isin;', |
288
|
|
|
'iuml;', |
289
|
|
|
'iuml', |
290
|
|
|
'kappa;', |
291
|
|
|
'lArr;', |
292
|
|
|
'lambda;', |
293
|
|
|
'lang;', |
294
|
|
|
'laquo;', |
295
|
|
|
'laquo', |
296
|
|
|
'larr;', |
297
|
|
|
'lceil;', |
298
|
|
|
'ldquo;', |
299
|
|
|
'le;', |
300
|
|
|
'lfloor;', |
301
|
|
|
'lowast;', |
302
|
|
|
'loz;', |
303
|
|
|
'lrm;', |
304
|
|
|
'lsaquo;', |
305
|
|
|
'lsquo;', |
306
|
|
|
'lt;', |
307
|
|
|
'lt', |
308
|
|
|
'macr;', |
309
|
|
|
'macr', |
310
|
|
|
'mdash;', |
311
|
|
|
'micro;', |
312
|
|
|
'micro', |
313
|
|
|
'middot;', |
314
|
|
|
'middot', |
315
|
|
|
'minus;', |
316
|
|
|
'mu;', |
317
|
|
|
'nabla;', |
318
|
|
|
'nbsp;', |
319
|
|
|
'nbsp', |
320
|
|
|
'ndash;', |
321
|
|
|
'ne;', |
322
|
|
|
'ni;', |
323
|
|
|
'not;', |
324
|
|
|
'not', |
325
|
|
|
'notin;', |
326
|
|
|
'nsub;', |
327
|
|
|
'ntilde;', |
328
|
|
|
'ntilde', |
329
|
|
|
'nu;', |
330
|
|
|
'oacute;', |
331
|
|
|
'oacute', |
332
|
|
|
'ocirc;', |
333
|
|
|
'ocirc', |
334
|
|
|
'oelig;', |
335
|
|
|
'ograve;', |
336
|
|
|
'ograve', |
337
|
|
|
'oline;', |
338
|
|
|
'omega;', |
339
|
|
|
'omicron;', |
340
|
|
|
'oplus;', |
341
|
|
|
'or;', |
342
|
|
|
'ordf;', |
343
|
|
|
'ordf', |
344
|
|
|
'ordm;', |
345
|
|
|
'ordm', |
346
|
|
|
'oslash;', |
347
|
|
|
'oslash', |
348
|
|
|
'otilde;', |
349
|
|
|
'otilde', |
350
|
|
|
'otimes;', |
351
|
|
|
'ouml;', |
352
|
|
|
'ouml', |
353
|
|
|
'para;', |
354
|
|
|
'para', |
355
|
|
|
'part;', |
356
|
|
|
'permil;', |
357
|
|
|
'perp;', |
358
|
|
|
'phi;', |
359
|
|
|
'pi;', |
360
|
|
|
'piv;', |
361
|
|
|
'plusmn;', |
362
|
|
|
'plusmn', |
363
|
|
|
'pound;', |
364
|
|
|
'pound', |
365
|
|
|
'prime;', |
366
|
|
|
'prod;', |
367
|
|
|
'prop;', |
368
|
|
|
'psi;', |
369
|
|
|
'quot;', |
370
|
|
|
'quot', |
371
|
|
|
'rArr;', |
372
|
|
|
'radic;', |
373
|
|
|
'rang;', |
374
|
|
|
'raquo;', |
375
|
|
|
'raquo', |
376
|
|
|
'rarr;', |
377
|
|
|
'rceil;', |
378
|
|
|
'rdquo;', |
379
|
|
|
'real;', |
380
|
|
|
'reg;', |
381
|
|
|
'reg', |
382
|
|
|
'rfloor;', |
383
|
|
|
'rho;', |
384
|
|
|
'rlm;', |
385
|
|
|
'rsaquo;', |
386
|
|
|
'rsquo;', |
387
|
|
|
'sbquo;', |
388
|
|
|
'scaron;', |
389
|
|
|
'sdot;', |
390
|
|
|
'sect;', |
391
|
|
|
'sect', |
392
|
|
|
'shy;', |
393
|
|
|
'shy', |
394
|
|
|
'sigma;', |
395
|
|
|
'sigmaf;', |
396
|
|
|
'sim;', |
397
|
|
|
'spades;', |
398
|
|
|
'sub;', |
399
|
|
|
'sube;', |
400
|
|
|
'sum;', |
401
|
|
|
'sup1;', |
402
|
|
|
'sup1', |
403
|
|
|
'sup2;', |
404
|
|
|
'sup2', |
405
|
|
|
'sup3;', |
406
|
|
|
'sup3', |
407
|
|
|
'sup;', |
408
|
|
|
'supe;', |
409
|
|
|
'szlig;', |
410
|
|
|
'szlig', |
411
|
|
|
'tau;', |
412
|
|
|
'there4;', |
413
|
|
|
'theta;', |
414
|
|
|
'thetasym;', |
415
|
|
|
'thinsp;', |
416
|
|
|
'thorn;', |
417
|
|
|
'thorn', |
418
|
|
|
'tilde;', |
419
|
|
|
'times;', |
420
|
|
|
'times', |
421
|
|
|
'trade;', |
422
|
|
|
'uArr;', |
423
|
|
|
'uacute;', |
424
|
|
|
'uacute', |
425
|
|
|
'uarr;', |
426
|
|
|
'ucirc;', |
427
|
|
|
'ucirc', |
428
|
|
|
'ugrave;', |
429
|
|
|
'ugrave', |
430
|
|
|
'uml;', |
431
|
|
|
'uml', |
432
|
|
|
'upsih;', |
433
|
|
|
'upsilon;', |
434
|
|
|
'uuml;', |
435
|
|
|
'uuml', |
436
|
|
|
'weierp;', |
437
|
|
|
'xi;', |
438
|
|
|
'yacute;', |
439
|
|
|
'yacute', |
440
|
|
|
'yen;', |
441
|
|
|
'yen', |
442
|
|
|
'yuml;', |
443
|
|
|
'yuml', |
444
|
|
|
'zeta;', |
445
|
|
|
'zwj;', |
446
|
|
|
'zwnj;' |
447
|
|
|
); |
448
|
|
|
|
449
|
|
|
const PCDATA = 0; |
450
|
|
|
const RCDATA = 1; |
451
|
|
|
const CDATA = 2; |
452
|
|
|
const PLAINTEXT = 3; |
453
|
|
|
|
454
|
|
|
const DOCTYPE = 0; |
455
|
|
|
const STARTTAG = 1; |
456
|
|
|
const ENDTAG = 2; |
457
|
|
|
const COMMENT = 3; |
458
|
|
|
const CHARACTR = 4; |
459
|
|
|
const EOF = 5; |
460
|
|
|
|
461
|
|
|
/**
 * Stores the input and immediately tokenizes all of it.
 *
 * The state machine is driven here: the handler method for the current
 * state is named "<state>State" and is invoked repeatedly until some
 * handler sets the state to null.
 *
 * @param string $data Markup to tokenize.
 */
public function __construct($data)
{
    // Input buffer, its length, and the cursor. The cursor starts at -1
    // because the data state increments it before reading.
    $this->data = $data;
    $this->EOF = strlen($data);
    $this->char = -1;

    $this->tree = new HTML5TreeConstructer;
    $this->content_model = self::PCDATA;
    $this->state = 'data';

    // The initial state is never null, so running the first handler
    // before testing the condition is equivalent to a plain while loop.
    do {
        $handler = $this->state . 'State';
        $this->$handler();
    } while ($this->state !== null);
}
475
|
|
|
|
476
|
|
|
/**
 * Returns the result of the tree constructor's save() call.
 *
 * @return mixed Whatever HTML5TreeConstructer::save() returns; the lexer in
 *               this file calls getElementsByTagName() on it, so it is
 *               presumably a DOM document — TODO confirm.
 */
public function save()
{
    $result = $this->tree->save();

    return $result;
}
480
|
|
|
|
481
|
|
|
/**
 * Reads the byte under the cursor.
 *
 * @return string|false The byte at the current cursor position, or false
 *                      once the cursor is at or beyond the input length.
 */
private function char()
{
    if ($this->char >= $this->EOF) {
        return false;
    }

    return $this->data[$this->char];
}
487
|
|
|
|
488
|
|
|
/**
 * Reads from the input at an arbitrary offset without moving the cursor.
 *
 * @param int $s Offset of the first byte to read.
 * @param int $l Number of bytes to read; 0 (the default) reads the single
 *               byte at $s.
 * @return string|null The byte at $s (when $l is 0) or the substr() slice;
 *                     null when $s + $l is not strictly below the input
 *                     length.
 *
 * NOTE(review): because the bound is strict, a multi-byte slice that ends
 * exactly at the end of the input yields null — confirm this is intended.
 */
private function character($s, $l = 0)
{
    // Out-of-range request: nothing to read.
    if ($s + $l >= $this->EOF) {
        return null;
    }

    return ($l === 0)
        ? $this->data[$s]
        : substr($this->data, $s, $l);
}
498
|
|
|
|
499
|
|
|
/**
 * Returns the longest run of characters from $char_class that begins
 * exactly at offset $start of the input.
 *
 * The previous preg_replace()-based version returned the ENTIRE remaining
 * input whenever the text at $start did not begin with a character of the
 * class (no match means preg_replace() hands back its subject unchanged).
 * An empty string is now returned in that case.
 *
 * @param string $char_class Body of a PCRE character class (e.g. 'A-Za-z');
 *                           it is inserted into the pattern unescaped.
 * @param int $start Offset in the input at which the run must begin.
 * @return string The matched run, or '' when there is none.
 */
private function characters($char_class, $start)
{
    // The cast covers PHP versions where substr() yields false for an
    // offset at or past the end of the string.
    $rest = (string) substr($this->data, $start);

    if ($rest !== '' && preg_match('#^[' . $char_class . ']+#', $rest, $match) === 1) {
        return $match[0];
    }

    return '';
}
503
|
|
|
|
504
|
|
|
/**
 * Data state: consumes input outside of tags.
 *
 * Depending on the current character and the content model flag this
 * switches to the entity-data or tag-open state, emits a character token,
 * or signals end of file.
 *
 * Fixes relative to the previous version:
 *  - The "anything else" branch could make no progress: when the current
 *    character was itself '<' or '&' but no branch above claimed it (for
 *    example '&' while the content model is CDATA), strcspn() returned 0,
 *    the cursor was moved back by one and the same character was read
 *    again forever. At least one character is now always consumed.
 *  - The "<!--" / "-->" checks looked at the wrong window (four characters
 *    starting one position too early, and three characters starting AT the
 *    '>'); they now inspect the characters ending at the current one, as
 *    the accompanying spec text describes.
 *
 * NOTE(review): the "anything else" branch still swallows '-' and '>' as
 * part of a longer run, so the escape-flag checks are only reached when one
 * of those characters happens to start a run. Widening the scan mask would
 * change tokenization of RCDATA/CDATA content and is deliberately not done
 * here.
 */
private function dataState()
{
    // Consume the next input character
    $this->char++;
    $char = $this->char();

    $is_pcdata = $this->content_model === self::PCDATA;
    $is_rcdata_or_cdata = $this->content_model === self::RCDATA
        || $this->content_model === self::CDATA;

    if ($char === '&' && ($is_pcdata || $this->content_model === self::RCDATA)) {
        /* U+0026 AMPERSAND (&)
        When the content model flag is set to one of the PCDATA or RCDATA
        states: switch to the entity data state. Otherwise: treat it as per
        the "anything else" entry below. */
        $this->state = 'entityData';

    } elseif ($char === '-') {
        /* If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is false, and there are at
        least three characters before this one in the input stream, and the
        last four characters in the input stream, including this one, are
        U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
        and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
        if ($is_rcdata_or_cdata && $this->escape === false &&
            $this->char >= 3 && substr($this->data, $this->char - 3, 4) === '<!--'
        ) {
            $this->escape = true;
        }

        /* In any case, emit the input character as a character token. Stay
        in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    } elseif ($char === '<' && ($is_pcdata || ($is_rcdata_or_cdata && $this->escape === false))) {
        /* U+003C LESS-THAN SIGN (<)
        When the content model flag is set to the PCDATA state: switch
        to the tag open state.

        When the content model flag is set to either the RCDATA state or
        the CDATA state and the escape flag is false: switch to the tag
        open state.

        Otherwise: treat it as per the "anything else" entry below. */
        $this->state = 'tagOpen';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is true, and the last three
        characters in the input stream including this one are U+002D
        HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
        set the escape flag to false. */
        if ($is_rcdata_or_cdata && $this->escape === true &&
            $this->char >= 2 && substr($this->data, $this->char - 2, 3) === '-->'
        ) {
            $this->escape = false;
        }

        /* In any case, emit the input character as a character token.
        Stay in the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Emit an end-of-file token. */
        $this->EOF();

    } elseif ($this->content_model === self::PLAINTEXT) {
        /* When the content model flag is set to the PLAINTEXT state
        THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
        the text and emit it as a character token. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => substr($this->data, $this->char)
            )
        );

        $this->EOF();

    } else {
        /* Anything else
        THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that
        otherwise would also be treated as a character token and emit it
        as a single character token. Stay in the data state. */
        $len = strcspn($this->data, '<&', $this->char);
        if ($len < 1) {
            // The current character is itself '<' or '&' and has to be
            // emitted as text; consume exactly that one character so the
            // cursor always advances.
            $len = 1;
        }
        $char = substr($this->data, $this->char, $len);
        // Leave the cursor on the last consumed character; the next call
        // increments it before reading.
        $this->char += $len - 1;

        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => $char
            )
        );

        $this->state = 'data';
    }
}
615
|
|
|
|
616
|
|
|
/**
 * Entity data state: tries to consume an entity after an ampersand and
 * emits either the resulting text or a literal '&', then returns to the
 * data state.
 *
 * Fix: the previous truthiness test (!$entity) also rejected the string
 * "0", which PHP treats as falsy, so an entity that produced the digit
 * zero was emitted as '&' instead. "0" is now accepted as a result
 * (assuming entity() returns the decoded text — TODO confirm against
 * entity()).
 */
private function entityDataState()
{
    // Attempt to consume an entity.
    $entity = $this->entity();

    // If nothing is returned, emit a U+0026 AMPERSAND character token.
    // Otherwise, emit the character token that was returned.
    $has_entity = $entity || $entity === '0';

    $this->emitToken(
        array(
            'type' => self::CHARACTR,
            'data' => $has_entity ? $entity : '&'
        )
    );

    // Finally, switch to the data state.
    $this->state = 'data';
}
634
|
|
|
|
635
|
|
|
/**
 * Tag open state: decides what a just-consumed '<' starts, based on the
 * content model flag and the following character.
 *
 * In the RCDATA/CDATA content models only "</" is significant; anything
 * else re-emits the '<' as text. In the PCDATA content model the next
 * character selects a markup declaration, a close tag, a start tag, a
 * bogus comment, or plain text. Other content models are left untouched.
 */
private function tagOpenState()
{
    $model = $this->content_model;

    if ($model == self::RCDATA || $model == self::CDATA) {
        /* If the next input character is not a U+002F SOLIDUS (/)
        character, emit a U+003C LESS-THAN SIGN character token and switch
        to the data state to process the next input character. */
        if ($this->character($this->char + 1) !== '/') {
            $this->emitToken(
                array(
                    'type' => self::CHARACTR,
                    'data' => '<'
                )
            );
            $this->state = 'data';
            return;
        }

        /* Otherwise consume the solidus and switch to the close tag open
        state. */
        $this->char++;
        $this->state = 'closeTagOpen';
        return;
    }

    if ($model != self::PCDATA) {
        return;
    }

    // PCDATA: consume the next input character.
    $this->char++;
    $next = $this->char();

    if ($next === '!') {
        /* U+0021 EXCLAMATION MARK (!)
        Switch to the markup declaration open state. */
        $this->state = 'markupDeclarationOpen';
        return;
    }

    if ($next === '/') {
        /* U+002F SOLIDUS (/)
        Switch to the close tag open state. */
        $this->state = 'closeTagOpen';
        return;
    }

    if ($next === '?') {
        /* U+003F QUESTION MARK (?)
        Parse error. Switch to the bogus comment state. */
        $this->state = 'bogusComment';
        return;
    }

    if ($next === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Parse error. Emit a U+003C LESS-THAN SIGN character token and a
        U+003E GREATER-THAN SIGN character token. Switch to the data state. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '<>'
            )
        );
        $this->state = 'data';
        return;
    }

    if (preg_match('/^[A-Za-z]$/', $next)) {
        /* An ASCII letter: create a new start tag token whose name is the
        lowercase version of the character, then switch to the tag name
        state. The token is not emitted yet; further details are filled in
        before it is emitted. */
        $this->token = array(
            'name' => strtolower($next),
            'type' => self::STARTTAG,
            'attr' => array()
        );
        $this->state = 'tagName';
        return;
    }

    /* Anything else
    Parse error. Emit a U+003C LESS-THAN SIGN character token and
    reconsume the current input character in the data state. */
    $this->emitToken(
        array(
            'type' => self::CHARACTR,
            'data' => '<'
        )
    );
    $this->char--;
    $this->state = 'data';
}
726
|
|
|
|
727
|
|
|
/**
 * Close tag open state: entered after "</" has been consumed.
 *
 * In the RCDATA/CDATA content models the close tag is only honoured when
 * its name matches the current node on the tree constructor's stack and is
 * followed by whitespace, '>', '/' or the end of the input; otherwise "</"
 * is emitted as text. In every other case the next character decides
 * between an end tag token, a parse error, or a bogus comment.
 *
 * Fixes relative to the previous version:
 *  - End of input directly after a matching tag name is treated as a valid
 *    terminator, as the list below states. Previously that case fell into
 *    the parse-error branch.
 *  - The character after the name is only handed to preg_match() when it
 *    exists, so null is never passed as the subject (deprecated since
 *    PHP 8.1).
 *  - The old "$this->EOF === $this->char" clause was dropped: in the
 *    transitions visible in this file the cursor is on the '/' when this
 *    state runs, so it could not hold.
 */
private function closeTagOpenState()
{
    $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
    $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;

    $restricted = $this->content_model === self::RCDATA
        || $this->content_model === self::CDATA;

    // Is the matching name followed by a permitted character or by EOF?
    $terminated = false;
    if ($restricted && $the_same) {
        $after = $this->char + 1 + strlen($next_node);
        $terminated = $after >= $this->EOF
            || preg_match('/[\t\n\x0b\x0c >\/]/', $this->data[$after]) === 1;
    }

    if ($restricted && !$terminated) {
        /* If the content model flag is set to the RCDATA or CDATA states then
        examine the next few characters. If they do not match the tag name of
        the last start tag token emitted (case insensitively), or if they do but
        they are not immediately followed by one of the following characters:
            * U+0009 CHARACTER TABULATION
            * U+000A LINE FEED (LF)
            * U+000B LINE TABULATION
            * U+000C FORM FEED (FF)
            * U+0020 SPACE
            * U+003E GREATER-THAN SIGN (>)
            * U+002F SOLIDUS (/)
            * EOF
        ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character
        token, a U+002F SOLIDUS character token, and switch to the data state
        to process the next input character. */
        $this->emitToken(
            array(
                'type' => self::CHARACTR,
                'data' => '</'
            )
        );

        $this->state = 'data';

    } else {
        /* Otherwise, if the content model flag is set to the PCDATA state,
        or if the next few characters do match that tag name, consume the
        next input character: */
        $this->char++;
        $char = $this->char();

        if (preg_match('/^[A-Za-z]$/', $char)) {
            /* An ASCII letter
            Create a new end tag token, set its tag name to the lowercase version
            of the input character, then switch to the tag name state. (Don't
            emit the token yet; further details will be filled in before it is
            emitted.) */
            $this->token = array(
                'name' => strtolower($char),
                'type' => self::ENDTAG
            );

            $this->state = 'tagName';

        } elseif ($char === '>') {
            /* U+003E GREATER-THAN SIGN (>)
            Parse error. Switch to the data state. */
            $this->state = 'data';

        } elseif ($this->char === $this->EOF) {
            /* EOF
            Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
            SOLIDUS character token. Reconsume the EOF character in the data state. */
            $this->emitToken(
                array(
                    'type' => self::CHARACTR,
                    'data' => '</'
                )
            );

            $this->char--;
            $this->state = 'data';

        } else {
            /* Parse error. Switch to the bogus comment state. */
            $this->state = 'bogusComment';
        }
    }
}
807
|
|
|
|
808
|
|
|
/**
 * Tag name state: accumulates the (lowercased) name of the current tag
 * token until whitespace, '/', '>' or the end of the input is reached.
 *
 * Fix: character() returns null at the end of the input; the whitespace
 * test now skips preg_match() in that case instead of passing null as the
 * subject, which is deprecated since PHP 8.1. The branch taken is the same.
 */
private function tagNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if ($char !== null && preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the before
        attribute name state. */
        $this->state = 'beforeAttributeName';

    } else {
        /* Anything else
        Append the current input character to the current tag token's tag name.
        Stay in the tag name state. */
        $this->token['name'] .= strtolower($char);
        $this->state = 'tagName';
    }
}
852
|
|
|
|
853
|
|
|
/**
 * Before attribute name state: skips whitespace and slashes inside a tag
 * and starts a new attribute at the first other character.
 *
 * Fix: character() returns null at the end of the input; the whitespace
 * test now skips preg_match() in that case instead of passing null as the
 * subject, which is deprecated since PHP 8.1. The branch taken is the same.
 */
private function beforeAttributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if ($char !== null && preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Stay in the before
        attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Start a new attribute in the current tag token. Set that attribute's
        name to the current input character (lowercased); its value starts
        out as null. Switch to the attribute name state. */
        $this->token['attr'][] = array(
            'name' => strtolower($char),
            'value' => null
        );

        $this->state = 'attributeName';
    }
}
902
|
|
|
|
903
|
|
|
/**
 * Attribute name state: accumulates the (lowercased) name of the attribute
 * most recently added to the current tag token.
 *
 * Fixes relative to the previous version:
 *  - A '/' now always switches to the before attribute name state, as the
 *    spec text below and the tag name / before attribute name states do.
 *    Previously this only happened when the '/' was NOT followed by '>', so
 *    the slash of a closing "/>" was appended to the attribute name.
 *  - character() returns null at the end of the input; preg_match() is no
 *    longer called with a null subject (deprecated since PHP 8.1).
 *  - The whitespace comment named the wrong target state.
 */
private function attributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if ($char !== null && preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the after attribute name state. */
        $this->state = 'afterAttributeName';

    } elseif ($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the before
        attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's name.
        Stay in the attribute name state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['name'] .= strtolower($char);

        $this->state = 'attributeName';
    }
}
954
|
|
|
|
955
|
|
|
/**
 * After attribute name state: entered when whitespace followed an
 * attribute name; decides whether a value, another attribute, or the end
 * of the tag comes next.
 *
 * Fixes relative to the previous version:
 *  - A '/' now always switches to the before attribute name state, as the
 *    spec text below and the tag name / before attribute name states do.
 *    Previously this only happened when the '/' was NOT followed by '>', so
 *    the slash of a closing "/>" started a new attribute named "/".
 *  - character() returns null at the end of the input; preg_match() is no
 *    longer called with a null subject (deprecated since PHP 8.1).
 */
private function afterAttributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if ($char !== null && preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the after attribute name state. */
        $this->state = 'afterAttributeName';

    } elseif ($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the
        before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Start a new attribute in the current tag token. Set that attribute's
        name to the current input character (lowercased); its value starts
        out as null. Switch to the attribute name state. */
        $this->token['attr'][] = array(
            'name' => strtolower($char),
            'value' => null
        );

        $this->state = 'attributeName';
    }
}
1009
|
|
|
|
1010
|
|
|
/**
 * Before attribute value state: entered after '='; skips whitespace and
 * picks the quoted or unquoted attribute value state.
 *
 * Fixes relative to the previous version:
 *  - End of input is handled like in the neighbouring attribute states
 *    (emit the current tag token, reconsume in the data state). Previously
 *    there was no EOF branch, so null was appended to the value and the
 *    unquoted-value state was entered with the cursor already at the end of
 *    the input.
 *  - character() returns null at the end of the input; preg_match() is no
 *    longer called with a null subject (deprecated since PHP 8.1).
 */
private function beforeAttributeValueState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if ($char !== null && preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '"') {
        /* U+0022 QUOTATION MARK (")
        Switch to the attribute value (double-quoted) state. */
        $this->state = 'attributeValueDoubleQuoted';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the attribute value (unquoted) state and reconsume
        this input character. */
        $this->char--;
        $this->state = 'attributeValueUnquoted';

    } elseif ($char === '\'') {
        /* U+0027 APOSTROPHE (')
        Switch to the attribute value (single-quoted) state. */
        $this->state = 'attributeValueSingleQuoted';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Switch to the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
1058
|
|
|
|
1059
|
|
|
/**
 * Attribute value (double-quoted) state: appends characters to the value
 * of the attribute most recently added to the current tag token until the
 * closing quotation mark or the end of the input.
 */
private function attributeValueDoubleQuotedState()
{
    // Consume the next input character.
    $this->char++;
    $current = $this->character($this->char);

    if ($current === '"') {
        /* U+0022 QUOTATION MARK (")
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';
        return;
    }

    if ($current === '&') {
        /* U+0026 AMPERSAND (&)
        Hand over to the entity in attribute value handling. */
        $this->entityInAttributeValueState('double');
        return;
    }

    if ($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the character
        in the data state. */
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Anything else
    Append the current input character to the current attribute's value.
    Stay in the attribute value (double-quoted) state. */
    $index = count($this->token['attr']) - 1;
    $this->token['attr'][$index]['value'] .= $current;
    $this->state = 'attributeValueDoubleQuoted';
}
1094
|
|
|
|
1095
|
|