1
|
|
|
<?php |
2
|
|
|
namespace Masterminds\HTML5\Parser; |
3
|
|
|
|
4
|
|
|
use Masterminds\HTML5\Elements; |
5
|
|
|
|
6
|
|
|
/** |
7
|
|
|
* Create an HTML5 DOM tree from events. |
8
|
|
|
* |
9
|
|
|
* This attempts to create a DOM from events emitted by a parser. This |
10
|
|
|
* attempts (but does not guarantee) to up-convert older HTML documents |
11
|
|
|
* to HTML5. It does this by applying HTML5's rules, but it will not |
12
|
|
|
* change the architecture of the document itself. |
13
|
|
|
* |
14
|
|
|
* Many of the error correction and quirks features suggested in the specification |
15
|
|
|
* are implemented herein; however, not all of them are. Since we do not |
16
|
|
|
* assume a graphical user agent, no presentation-specific logic is conducted |
17
|
|
|
* during tree building. |
18
|
|
|
* |
19
|
|
|
* FIXME: The present tree builder does not exactly follow the state machine rules |
20
|
|
|
* for insert modes as outlined in the HTML5 spec. The processor needs to be |
21
|
|
|
* re-written to accommodate this. See, for example, the Go language HTML5 |
22
|
|
|
* parser. |
23
|
|
|
*/ |
24
|
|
|
class DOMTreeBuilder implements EventHandler |
25
|
|
|
{ |
26
|
|
|
/** |
27
|
|
|
* Defined in http://www.w3.org/TR/html51/infrastructure.html#html-namespace-0 |
28
|
|
|
*/ |
29
|
|
|
const NAMESPACE_HTML = 'http://www.w3.org/1999/xhtml'; |
30
|
|
|
|
31
|
|
|
const NAMESPACE_MATHML = 'http://www.w3.org/1998/Math/MathML'; |
32
|
|
|
|
33
|
|
|
const NAMESPACE_SVG = 'http://www.w3.org/2000/svg'; |
34
|
|
|
|
35
|
|
|
const NAMESPACE_XLINK = 'http://www.w3.org/1999/xlink'; |
36
|
|
|
|
37
|
|
|
const NAMESPACE_XML = 'http://www.w3.org/XML/1998/namespace'; |
38
|
|
|
|
39
|
|
|
const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/'; |
40
|
|
|
|
41
|
|
|
const OPT_DISABLE_HTML_NS = 'disable_html_ns'; |
42
|
|
|
|
43
|
|
|
const OPT_TARGET_DOC = 'target_document'; |
44
|
|
|
|
45
|
|
|
const OPT_IMPLICIT_NS = 'implicit_namespaces'; |
46
|
|
|
|
47
|
|
|
/** |
48
|
|
|
* Holds the HTML5 element names that cause a namespace switch. |
49
|
|
|
* |
50
|
|
|
* @var array |
51
|
|
|
*/ |
52
|
|
|
protected $nsRoots = array( |
53
|
|
|
'html' => self::NAMESPACE_HTML, |
54
|
|
|
'svg' => self::NAMESPACE_SVG, |
55
|
|
|
'math' => self::NAMESPACE_MATHML |
56
|
|
|
); |
57
|
|
|
|
58
|
|
|
/** |
59
|
|
|
* Holds the always-available namespaces (which do not require an XMLNS declaration). |
60
|
|
|
* |
61
|
|
|
* @var array |
62
|
|
|
*/ |
63
|
|
|
protected $implicitNamespaces = array( |
64
|
|
|
'xml' => self::NAMESPACE_XML, |
65
|
|
|
'xmlns' => self::NAMESPACE_XMLNS, |
66
|
|
|
'xlink' => self::NAMESPACE_XLINK |
67
|
|
|
); |
68
|
|
|
|
69
|
|
|
/** |
70
|
|
|
* Holds a stack of currently active namespaces. |
71
|
|
|
* |
72
|
|
|
* @var array |
73
|
|
|
*/ |
74
|
|
|
protected $nsStack = array(); |
75
|
|
|
|
76
|
|
|
/** |
77
|
|
|
* Holds the number of namespaces declared by a node. |
78
|
|
|
* |
79
|
|
|
* @var array |
80
|
|
|
*/ |
81
|
|
|
protected $pushes = array(); |
82
|
|
|
|
83
|
|
|
/** |
84
|
|
|
* Defined in 8.2.5. |
85
|
|
|
*/ |
86
|
|
|
const IM_INITIAL = 0; |
87
|
|
|
|
88
|
|
|
const IM_BEFORE_HTML = 1; |
89
|
|
|
|
90
|
|
|
const IM_BEFORE_HEAD = 2; |
91
|
|
|
|
92
|
|
|
const IM_IN_HEAD = 3; |
93
|
|
|
|
94
|
|
|
const IM_IN_HEAD_NOSCRIPT = 4; |
95
|
|
|
|
96
|
|
|
const IM_AFTER_HEAD = 5; |
97
|
|
|
|
98
|
|
|
const IM_IN_BODY = 6; |
99
|
|
|
|
100
|
|
|
const IM_TEXT = 7; |
101
|
|
|
|
102
|
|
|
const IM_IN_TABLE = 8; |
103
|
|
|
|
104
|
|
|
const IM_IN_TABLE_TEXT = 9; |
105
|
|
|
|
106
|
|
|
const IM_IN_CAPTION = 10; |
107
|
|
|
|
108
|
|
|
const IM_IN_COLUMN_GROUP = 11; |
109
|
|
|
|
110
|
|
|
const IM_IN_TABLE_BODY = 12; |
111
|
|
|
|
112
|
|
|
const IM_IN_ROW = 13; |
113
|
|
|
|
114
|
|
|
const IM_IN_CELL = 14; |
115
|
|
|
|
116
|
|
|
const IM_IN_SELECT = 15; |
117
|
|
|
|
118
|
|
|
const IM_IN_SELECT_IN_TABLE = 16; |
119
|
|
|
|
120
|
|
|
const IM_AFTER_BODY = 17; |
121
|
|
|
|
122
|
|
|
const IM_IN_FRAMESET = 18; |
123
|
|
|
|
124
|
|
|
const IM_AFTER_FRAMESET = 19; |
125
|
|
|
|
126
|
|
|
const IM_AFTER_AFTER_BODY = 20; |
127
|
|
|
|
128
|
|
|
const IM_AFTER_AFTER_FRAMESET = 21; |
129
|
|
|
|
130
|
|
|
const IM_IN_SVG = 22; |
131
|
|
|
|
132
|
|
|
const IM_IN_MATHML = 23; |
133
|
|
|
|
134
|
|
|
protected $options = array(); |
135
|
|
|
|
136
|
|
|
protected $stack = array(); |
137
|
|
|
|
138
|
|
|
protected $current; // Pointer in the tag hierarchy. |
139
|
|
|
protected $rules; |
140
|
|
|
protected $doc; |
141
|
|
|
|
142
|
|
|
protected $frag; |
143
|
|
|
|
144
|
|
|
protected $processor; |
145
|
|
|
|
146
|
|
|
protected $insertMode = 0; |
147
|
|
|
|
148
|
|
|
/** |
149
|
|
|
* Track if we are in an element that allows only inline child nodes |
150
|
|
|
* @var string|null |
151
|
|
|
*/ |
152
|
|
|
protected $onlyInline; |
153
|
|
|
|
154
|
|
|
/** |
155
|
|
|
* Quirks mode is enabled by default. |
156
|
|
|
* Any document that is missing the |
157
|
|
|
* DT will be considered to be in quirks mode. |
158
|
|
|
*/ |
159
|
|
|
protected $quirks = true; |
160
|
|
|
|
161
|
|
|
protected $errors = array(); |
162
|
|
|
|
163
|
110 |
|
/**
 * Set up a tree builder for a full document or for a fragment.
 *
 * When the OPT_TARGET_DOC option is set, its value is used as the document
 * to build into; otherwise a new document with an "html" doctype is
 * created. The namespace stack is seeded with the HTML namespace as the
 * default, the built-in implicit namespaces, and any caller-supplied ones
 * (OPT_IMPLICIT_NS, or the legacy "implicitNamespaces" key).
 *
 * @param bool  $isFragment True to build into a document fragment; the
 *                          insert mode then starts at IM_IN_BODY.
 * @param array $options    Builder options, keyed by the OPT_* constants.
 */
public function __construct($isFragment = false, array $options = array())
{
    $this->options = $options;

    if (isset($options[self::OPT_TARGET_DOC])) {
        $this->doc = $options[self::OPT_TARGET_DOC];
    } else {
        $impl = new \DOMImplementation();
        // XXX:
        // Create the doctype. For now, we are always creating HTML5
        // documents, and attempting to up-convert any older DTDs to HTML5.
        $dt = $impl->createDocumentType('html');
        // The qualified name must be a string: passing null here is
        // deprecated as of PHP 8.1. An empty string still yields a document
        // without a root element.
        $this->doc = $impl->createDocument(null, '', $dt);
    }
    $this->errors = array();

    $this->current = $this->doc; // ->documentElement;

    // Create a rules engine for tags.
    $this->rules = new TreeBuildingRules($this->doc);

    $implicitNS = array();
    if (isset($this->options[self::OPT_IMPLICIT_NS])) {
        $implicitNS = $this->options[self::OPT_IMPLICIT_NS];
    } elseif (isset($this->options["implicitNamespaces"])) {
        $implicitNS = $this->options["implicitNamespaces"];
    }

    // Fill $nsStack with the default HTML5 namespaces, plus the implicit
    // namespaces taken from $options. With the array union operator the
    // left-most entry wins, so caller-supplied prefixes take precedence.
    array_unshift($this->nsStack, $implicitNS + array(
        '' => self::NAMESPACE_HTML
    ) + $this->implicitNamespaces);

    if ($isFragment) {
        $this->insertMode = static::IM_IN_BODY;
        $this->frag = $this->doc->createDocumentFragment();
        $this->current = $this->frag;
    }
}
203
|
|
|
|
204
|
|
|
/**
 * Return the document this builder writes into.
 *
 * This is either the document created by the constructor or the one that
 * was supplied through the OPT_TARGET_DOC option.
 *
 * @return \DOMDocument
 */
public function document()
{
    $document = $this->doc;

    return $document;
}
211
|
|
|
|
212
|
|
|
/**
 * Return the document fragment that receives the parsed nodes.
 *
 * A fragment is used because parsed markup may have zero or more nodes at
 * its root. The fragment is only created when the builder was constructed
 * in fragment mode; otherwise the property is never assigned and this
 * returns null.
 *
 * @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#concept-frag-parse-context
 *
 * @return \DOMDocumentFragment|null
 */
public function fragment()
{
    $fragment = $this->frag;

    return $fragment;
}
226
|
|
|
|
227
|
|
|
/**
 * Register the handler for processing instructions.
 *
 * Once a processor is set, processing instructions are handed to it
 * instead of being inserted into the DOM tree directly.
 *
 * @param \Masterminds\HTML5\InstructionProcessor $proc The processor to use.
 */
public function setInstructionProcessor(\Masterminds\HTML5\InstructionProcessor $proc)
{
    $this->processor = $proc;
}
237
|
|
|
|
238
|
94 |
|
/**
 * Handle a DOCTYPE declaration.
 *
 * The inbound doctype is not preserved (documents are always converted to
 * HTML5); it is used solely to set quirks mode and to advance the insert
 * mode. A DOCTYPE seen after the initial insert mode is reported as a parse
 * error and ignored entirely, so it does not alter quirks mode either.
 *
 * @param string      $name   The doctype name.
 * @param int         $idType The identifier type (unused here).
 * @param string|null $id     The identifier (unused here).
 * @param bool        $quirks Whether the document should be in quirks mode.
 */
public function doctype($name, $idType = 0, $id = null, $quirks = false)
{
    if ($this->insertMode > static::IM_INITIAL) {
        $this->parseError("Illegal placement of DOCTYPE tag. Ignoring: " . $name);

        return;
    }

    // Only a DOCTYPE that is actually honored may decide quirks mode.
    $this->quirks = $quirks;

    $this->insertMode = static::IM_BEFORE_HTML;
}
252
|
|
|
|
253
|
|
|
/**
 * Process the start tag.
 *
 * Creates the element for the tag (switching the default namespace when a
 * namespace root such as svg or math is entered), copies the attributes
 * onto it, attaches it to the tree and updates the insert mode.
 *
 * @todo - XMLNS namespace handling (we need to parse, even if it's not valid)
 *       - XLink, MathML and SVG namespace handling
 *       - Omission rules: 8.1.2.4 Optional tags
 *
 * @param string $name        The tag name.
 * @param array  $attributes  Attribute values keyed by attribute name.
 * @param bool   $selfClosing Whether the tag was written as self-closing.
 *
 * @return mixed The element mask from Elements::element(), which the
 *               tokenizer can use to set various processing rules.
 */
public function startTag($name, $attributes = array(), $selfClosing = false)
{
    $lname = $this->normalizeTagName($name);

    // Make sure we have an html element.
    if (! $this->doc->documentElement && $name !== 'html' && ! $this->frag) {
        $this->startTag('html');
    }

    // Set quirks mode if we're at IM_INITIAL with no doctype.
    if ($this->insertMode == static::IM_INITIAL) {
        $this->quirks = true;
        $this->parseError("No DOCTYPE specified.");
    }

    // SPECIAL TAG HANDLING:
    // Spec says do this, and "don't ask."
    if ($name == 'image' && !($this->insertMode === static::IM_IN_SVG || $this->insertMode === static::IM_IN_MATHML)) {
        $name = 'img';
        // Keep the name used to create the element in sync; otherwise an
        // <image> element is created but handled as a void <img>.
        $lname = $this->normalizeTagName($name);
    }

    // Autoclose p tags where appropriate.
    if ($this->insertMode >= static::IM_IN_BODY && Elements::isA($name, Elements::AUTOCLOSE_P)) {
        $this->autoclose('p');
    }

    // Set insert mode:
    switch ($name) {
        case 'html':
            $this->insertMode = static::IM_BEFORE_HEAD;
            break;
        case 'head':
            if ($this->insertMode > static::IM_BEFORE_HEAD) {
                $this->parseError("Unexpected head tag outside of head context.");
            } else {
                $this->insertMode = static::IM_IN_HEAD;
            }
            break;
        case 'body':
            $this->insertMode = static::IM_IN_BODY;
            break;
        case 'svg':
            $this->insertMode = static::IM_IN_SVG;
            break;
        case 'math':
            $this->insertMode = static::IM_IN_MATHML;
            break;
        case 'noscript':
            if ($this->insertMode == static::IM_IN_HEAD) {
                $this->insertMode = static::IM_IN_HEAD_NOSCRIPT;
            }
            break;
    }

    // Special case handling for SVG.
    if ($this->insertMode == static::IM_IN_SVG) {
        $lname = Elements::normalizeSvgElement($lname);
    }

    $pushes = 0;
    // When we find a tag that appears inside $nsRoots, we have to switch the default namespace.
    if (isset($this->nsRoots[$lname]) && $this->nsStack[0][''] !== $this->nsRoots[$lname]) {
        array_unshift($this->nsStack, array(
            '' => $this->nsRoots[$lname]
        ) + $this->nsStack[0]);
        $pushes ++;
    }
    $needsWorkaround = false;
    if (isset($this->options["xmlNamespaces"]) && $this->options["xmlNamespaces"]) {
        // When xmlNamespaces is true and we find an 'xmlns' or 'xmlns:*' attribute,
        // we add a new item to the $nsStack.
        foreach ($attributes as $aName => $aVal) {
            if ($aName === 'xmlns') {
                $needsWorkaround = $aVal;
                array_unshift($this->nsStack, array(
                    '' => $aVal
                ) + $this->nsStack[0]);
                $pushes ++;
            } elseif ((($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : '') === 'xmlns') {
                array_unshift($this->nsStack, array(
                    substr($aName, $pos + 1) => $aVal
                ) + $this->nsStack[0]);
                $pushes ++;
            }
        }
    }

    if ($this->onlyInline && Elements::isA($lname, Elements::BLOCK_TAG)) {
        $this->autoclose($this->onlyInline);
        $this->onlyInline = null;
    }

    try {
        $prefix = ($pos = strpos($lname, ':')) ? substr($lname, 0, $pos) : '';

        if ($needsWorkaround !== false) {
            // The element is created by parsing a tiny XML document, so the
            // namespace URIs must be escaped before being placed inside the
            // attribute values; a raw &, <, > or " would make the XML
            // ill-formed and leave no element to import.
            $xmlSpecial = array('&', '<', '>', '"');
            $xmlEscaped = array('&amp;', '&lt;', '&gt;', '&quot;');

            $prefixDecl = '';
            if (strlen($prefix) && isset($this->nsStack[0][$prefix])) {
                $prefixUri = str_replace($xmlSpecial, $xmlEscaped, $this->nsStack[0][$prefix]);
                $prefixDecl = "xmlns:$prefix=\"" . $prefixUri . "\"";
            }
            $defaultUri = str_replace($xmlSpecial, $xmlEscaped, $needsWorkaround);

            $xml = "<$lname xmlns=\"$defaultUri\" " . $prefixDecl . "/>";

            $frag = new \DOMDocument('1.0', 'UTF-8');
            $frag->loadXML($xml);

            $ele = $this->doc->importNode($frag->documentElement, true);
        } else {
            if (!isset($this->nsStack[0][$prefix]) || ($prefix === "" && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS])) {
                $ele = $this->doc->createElement($lname);
            } else {
                $ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
            }
        }
    } catch (\DOMException $e) {
        $this->parseError("Illegal tag name: <$lname>. Replaced with <invalid>.");
        $ele = $this->doc->createElement('invalid');
    }

    if (Elements::isA($lname, Elements::BLOCK_ONLY_INLINE)) {
        $this->onlyInline = $lname;
    }

    // When we add some namespaces, we have to track them. Later, when the end tag is handled, we have to remove them.
    // When we are on a void tag, we do not need to care about namespace nesting.
    if ($pushes > 0 && !Elements::isA($name, Elements::VOID_TAG)) {
        // PHP tends to free the memory used by DOM,
        // to avoid spl_object_hash collisions we have to avoid garbage collection of $ele by storing it into $pushes
        // see https://bugs.php.net/bug.php?id=67459
        $this->pushes[spl_object_hash($ele)] = array($pushes, $ele);

        // SEE https://github.com/facebook/hhvm/issues/2962
        if (defined('HHVM_VERSION')) {
            $ele->setAttribute('html5-php-fake-id-attribute', spl_object_hash($ele));
        }
    }

    foreach ($attributes as $aName => $aVal) {
        // xmlns attributes can't be set
        if ($aName === 'xmlns') {
            continue;
        }

        if ($this->insertMode == static::IM_IN_SVG) {
            $aName = Elements::normalizeSvgAttribute($aName);
        } elseif ($this->insertMode == static::IM_IN_MATHML) {
            $aName = Elements::normalizeMathMlAttribute($aName);
        }

        try {
            $prefix = ($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : false;

            if ($prefix === 'xmlns') {
                $ele->setAttributeNs(self::NAMESPACE_XMLNS, $aName, $aVal);
            } elseif ($prefix !== false && isset($this->nsStack[0][$prefix])) {
                $ele->setAttributeNs($this->nsStack[0][$prefix], $aName, $aVal);
            } else {
                $ele->setAttribute($aName, $aVal);
            }
        } catch (\DOMException $e) {
            $this->parseError("Illegal attribute name for tag $name. Ignoring: $aName");
            continue;
        }

        // This is necessary on a non-DTD schema, like HTML5.
        if ($aName == 'id') {
            $ele->setIdAttribute('id', true);
        }
    }

    // Some elements have special processing rules. Handle those separately.
    if ($this->rules->hasRules($name) && $this->frag !== $this->current) {
        $this->current = $this->rules->evaluate($ele, $this->current);
    } // Otherwise, it's a standard element.
    else {
        $this->current->appendChild($ele);

        if (! Elements::isA($name, Elements::VOID_TAG)) {
            $this->current = $ele;
        }

        // Self-closing tags should only be respected on foreign elements
        // (and are implied on void elements)
        // See: https://www.w3.org/TR/html5/syntax.html#start-tags
        if (Elements::isHtml5Element($name)) {
            $selfClosing = false;
        }
    }

    // This is sort of a last-ditch attempt to correct for cases where no head/body
    // elements are provided.
    if ($this->insertMode <= static::IM_BEFORE_HEAD && $name != 'head' && $name != 'html') {
        $this->insertMode = static::IM_IN_BODY;
    }

    // When we are on a void tag, we do not need to care about namespace nesting,
    // but we have to remove the namespaces pushed to $nsStack.
    if ($pushes > 0 && Elements::isA($name, Elements::VOID_TAG)) {
        // remove the namespaces defined by the current node
        for ($i = 0; $i < $pushes; $i ++) {
            array_shift($this->nsStack);
        }
    }

    if ($selfClosing) {
        $this->endTag($name);
    }

    // Return the element mask, which the tokenizer can then use to set
    // various processing rules.
    return Elements::element($name);
}
472
|
|
|
|
473
|
104 |
|
/**
 * Process an end tag.
 *
 * Closes the nearest open element with the given name, drops the
 * namespaces that the current element pushed onto the namespace stack and
 * updates the insert mode for head, body, svg and math.
 *
 * @param string $name The tag name.
 */
public function endTag($name)
{
    $lname = $this->normalizeTagName($name);

    // Ignore closing tags for unary elements.
    if (Elements::isA($name, Elements::VOID_TAG)) {
        return;
    }

    if ($this->insertMode <= static::IM_BEFORE_HTML) {
        // 8.2.5.4.2
        if (in_array($name, array(
            'html',
            'br',
            'head',
            'title'
        ))) {
            $this->startTag('html');
            $this->endTag($name);
            $this->insertMode = static::IM_BEFORE_HEAD;

            return;
        }

        // Ignore the tag.
        $this->parseError("Illegal closing tag at global scope.");

        return;
    }

    // Special case handling for SVG.
    if ($this->insertMode == static::IM_IN_SVG) {
        $lname = Elements::normalizeSvgElement($lname);
    }

    // See https://github.com/facebook/hhvm/issues/2962
    if (defined('HHVM_VERSION') && ($cid = $this->current->getAttribute('html5-php-fake-id-attribute'))) {
        $this->current->removeAttribute('html5-php-fake-id-attribute');
    } else {
        $cid = spl_object_hash($this->current);
    }

    // XXX: HTML has no parent. What do we do, though,
    // if this element appears in the wrong place?
    if ($lname == 'html') {
        return;
    }

    // remove the namespaces defined by the current node
    if (isset($this->pushes[$cid])) {
        for ($i = 0; $i < $this->pushes[$cid][0]; $i ++) {
            array_shift($this->nsStack);
        }
        unset($this->pushes[$cid]);
    }

    if (! $this->autoclose($lname)) {
        $this->parseError('Could not find closing tag for ' . $lname);
    }

    switch ($lname) {
        case "head":
            $this->insertMode = static::IM_AFTER_HEAD;
            break;
        case "body":
            $this->insertMode = static::IM_AFTER_BODY;
            break;
        case "svg":
        // The MathML root element is <math>; it is the tag that switches the
        // builder into IM_IN_MATHML, so its end tag must switch back.
        case "math":
        // Kept for backward compatibility with the previous behavior.
        case "mathml":
            $this->insertMode = static::IM_IN_BODY;
            break;
    }
}
552
|
|
|
|
553
|
5 |
|
/**
 * Append a comment node to the current node.
 *
 * @param string $cdata The comment text.
 */
public function comment($cdata)
{
    // TODO: Need to handle case where comment appears outside of the HTML tag.
    $this->current->appendChild($this->doc->createComment($cdata));
}
559
|
|
|
|
560
|
87 |
|
/**
 * Append character data to the current node.
 *
 * Text that arrives before the head has been reached is never inserted.
 * Whitespace there is dropped silently; anything else is dropped and
 * recorded as a parse error.
 *
 * @param string $data The character data.
 */
public function text($data)
{
    // XXX: Hmmm.... should we really be this strict?
    if ($this->insertMode < static::IM_IN_HEAD) {
        // Per '8.2.5.4.3 The "before head" insertion mode' the characters
        // " \t\n\r\f" should be ignored but no mention of a parse error. This is
        // practical as most documents contain these characters. Other text is not
        // expected here so recording a parse error is necessary.
        $dataTmp = trim($data, " \t\n\r\f");
        // Compare against the empty string instead of using empty(), which
        // would also treat the text "0" as if nothing had been seen.
        if ($dataTmp !== '') {
            $this->parseError("Unexpected text. Ignoring: " . $dataTmp);
        }

        return;
    }

    $node = $this->doc->createTextNode($data);
    $this->current->appendChild($node);
}
580
|
|
|
|
581
|
110 |
|
/**
 * Handle the end of the input.
 *
 * Nothing is done here at present; elements that are still open are left
 * as they are.
 */
public function eof()
{
    // If the $current isn't the $root, do we need to do anything?
}
585
|
|
|
|
586
|
11 |
|
/**
 * Record a parse error.
 *
 * The error is stored as a formatted string and can be retrieved later
 * with getErrors().
 *
 * @param string $msg  The error message.
 * @param int    $line The line number of the error.
 * @param int    $col  The column number of the error.
 */
public function parseError($msg, $line = 0, $col = 0)
{
    $entry = sprintf("Line %d, Col %d: %s", $line, $col, $msg);
    $this->errors[] = $entry;
}
590
|
|
|
|
591
|
104 |
|
/**
 * Return the parse errors recorded so far.
 *
 * @return array The formatted error strings, in the order they were recorded.
 */
public function getErrors()
{
    $recorded = $this->errors;

    return $recorded;
}
595
|
|
|
|
596
|
3 |
|
/**
 * Append a CDATA section to the current node.
 *
 * @param string $data The content of the CDATA section.
 */
public function cdata($data)
{
    $this->current->appendChild($this->doc->createCDATASection($data));
}
601
|
|
|
|
602
|
5 |
|
/**
 * Handle a processing instruction.
 *
 * An "xml" instruction seen in the initial insert mode is dropped. When an
 * instruction processor has been registered it receives the instruction,
 * and a non-empty return value becomes the new current node. Without a
 * processor the instruction is appended to the current node.
 *
 * @param string      $name The instruction name (target).
 * @param string|null $data The instruction data.
 */
public function processingInstruction($name, $data = null)
{
    // XXX: Ignore initial XML declaration, per the spec.
    if ($this->insertMode == static::IM_INITIAL) {
        if ('xml' == strtolower($name)) {
            return;
        }
    }

    if (! isset($this->processor)) {
        // No processor: this is just a dumb PI element.
        $instruction = $this->doc->createProcessingInstruction($name, $data);
        $this->current->appendChild($instruction);

        return;
    }

    // Important: The processor may modify the current DOM tree however
    // it sees fit.
    $replacement = $this->processor->process($this->current, $name, $data);
    if ($replacement) {
        $this->current = $replacement;
    }
}
625
|
|
|
|
626
|
|
|
// ========================================================================== |
627
|
|
|
// UTILITIES |
628
|
|
|
// ========================================================================== |
629
|
|
|
|
630
|
|
|
/**
 * Apply normalization rules to a tag name.
 *
 * See sections 2.9 and 8.1.2.
 *
 * The name is currently returned unchanged. Stripping a namespace prefix
 * (everything up to the last ':') was considered, but section 2.9
 * suggests that this should not be done.
 *
 * @param string $name
 *            The tag name.
 * @return string The normalized tag name.
 */
protected function normalizeTagName($name)
{
    $normalized = $name;

    return $normalized;
}
646
|
|
|
|
647
|
|
|
/**
 * Placeholder for resolving a tag name in quirks mode.
 *
 * @param string $name The tag name.
 *
 * @throws \Exception Always, because this is not implemented.
 */
protected function quirksTreeResolver($name)
{
    $notImplemented = new \Exception("Not implemented.");

    throw $notImplemented;
}
651
|
|
|
|
652
|
|
|
/**
 * Close the nearest open element with the given tag name.
 *
 * Starting at the current node, this walks up through the parents. When
 * an element with a matching tag name is found, its parent becomes the
 * new current node. The walk stops without changing anything as soon as
 * a non-element node is reached or there is no further parent.
 *
 * @param string $tagName
 *
 * @return bool True if a matching element was found and closed.
 */
protected function autoclose($tagName)
{
    $node = $this->current;

    while (true) {
        if ($node->nodeType != XML_ELEMENT_NODE) {
            return false;
        }

        if ($node->tagName == $tagName) {
            $this->current = $node->parentNode;

            return true;
        }

        $node = $node->parentNode;
        if (! $node) {
            break;
        }
    }

    return false;
}
674
|
|
|
|
675
|
|
|
/**
 * Check whether an element with the given tag name is currently open.
 *
 * The current node and each of its parents are inspected in turn, for as
 * long as they are element nodes.
 *
 * @param string $tagName
 *
 * @return bool True if the current node or one of the elements above it
 *              has the given tag name.
 */
protected function isAncestor($tagName)
{
    for ($node = $this->current; $node->nodeType === XML_ELEMENT_NODE; $node = $node->parentNode) {
        if ($node->tagName == $tagName) {
            return true;
        }
    }

    return false;
}
697
|
|
|
|
698
|
|
|
/**
 * Returns true if the current node is an element with the given tag name.
 *
 * The current node is the node that newly created nodes are appended to,
 * i.e. the immediate parent of whatever is inserted next.
 *
 * @param string $tagName
 *
 * @return bool
 */
protected function isParent($tagName)
{
    $node = $this->current;

    // The current node may be the document or a document fragment, neither
    // of which has a tagName property; only elements can match.
    if ($node === null || $node->nodeType !== XML_ELEMENT_NODE) {
        return false;
    }

    return $node->tagName == $tagName;
}
709
|
|
|
} |
710
|
|
|
|