1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Masterminds\HTML5\Parser; |
4
|
|
|
|
5
|
|
|
use Masterminds\HTML5\Elements; |
6
|
|
|
use Masterminds\HTML5\InstructionProcessor; |
7
|
|
|
|
8
|
|
|
/** |
9
|
|
|
* Create an HTML5 DOM tree from events. |
10
|
|
|
* |
11
|
|
|
* This attempts to create a DOM from events emitted by a parser. This |
12
|
|
|
* attempts (but does not guarantee) to up-convert older HTML documents |
13
|
|
|
* to HTML5. It does this by applying HTML5's rules, but it will not |
14
|
|
|
* change the architecture of the document itself. |
15
|
|
|
* |
16
|
|
|
* Many of the error correction and quirks features suggested in the specification |
17
|
|
|
* are implemented herein; however, not all of them are. Since we do not |
18
|
|
|
* assume a graphical user agent, no presentation-specific logic is conducted |
19
|
|
|
* during tree building. |
20
|
|
|
* |
21
|
|
|
* FIXME: The present tree builder does not exactly follow the state machine rules |
22
|
|
|
* for insert modes as outlined in the HTML5 spec. The processor needs to be |
23
|
|
|
* re-written to accommodate this. See, for example, the Go language HTML5 |
24
|
|
|
* parser. |
25
|
|
|
*/ |
26
|
|
|
class DOMTreeBuilder implements EventHandler |
27
|
|
|
{ |
28
|
|
|
/** |
29
|
|
|
* Defined in http://www.w3.org/TR/html51/infrastructure.html#html-namespace-0. |
30
|
|
|
*/ |
31
|
|
|
const NAMESPACE_HTML = 'http://www.w3.org/1999/xhtml'; |
32
|
|
|
|
33
|
|
|
const NAMESPACE_MATHML = 'http://www.w3.org/1998/Math/MathML'; |
34
|
|
|
|
35
|
|
|
const NAMESPACE_SVG = 'http://www.w3.org/2000/svg'; |
36
|
|
|
|
37
|
|
|
const NAMESPACE_XLINK = 'http://www.w3.org/1999/xlink'; |
38
|
|
|
|
39
|
|
|
const NAMESPACE_XML = 'http://www.w3.org/XML/1998/namespace'; |
40
|
|
|
|
41
|
|
|
const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/'; |
42
|
|
|
|
43
|
|
|
const OPT_DISABLE_HTML_NS = 'disable_html_ns'; |
44
|
|
|
|
45
|
|
|
const OPT_TARGET_DOC = 'target_document'; |
46
|
|
|
|
47
|
|
|
const OPT_IMPLICIT_NS = 'implicit_namespaces'; |
48
|
|
|
|
49
|
|
|
/** |
50
|
|
|
* Holds the HTML5 element names that cause a namespace switch. |
51
|
|
|
* |
52
|
|
|
* @var array |
53
|
|
|
*/ |
54
|
|
|
protected $nsRoots = array( |
55
|
|
|
'html' => self::NAMESPACE_HTML, |
56
|
|
|
'svg' => self::NAMESPACE_SVG, |
57
|
|
|
'math' => self::NAMESPACE_MATHML, |
58
|
|
|
); |
59
|
|
|
|
60
|
|
|
/** |
61
|
|
|
* Holds the always-available namespaces (which do not require an XMLNS declaration). |
62
|
|
|
* |
63
|
|
|
* @var array |
64
|
|
|
*/ |
65
|
|
|
protected $implicitNamespaces = array( |
66
|
|
|
'xml' => self::NAMESPACE_XML, |
67
|
|
|
'xmlns' => self::NAMESPACE_XMLNS, |
68
|
|
|
'xlink' => self::NAMESPACE_XLINK, |
69
|
|
|
); |
70
|
|
|
|
71
|
|
|
/** |
72
|
|
|
* Holds a stack of currently active namespaces. |
73
|
|
|
* |
74
|
|
|
* @var array |
75
|
|
|
*/ |
76
|
|
|
protected $nsStack = array(); |
77
|
|
|
|
78
|
|
|
/** |
79
|
|
|
* Holds the number of namespaces declared by a node. |
80
|
|
|
* |
81
|
|
|
* @var array |
82
|
|
|
*/ |
83
|
|
|
protected $pushes = array(); |
84
|
|
|
|
85
|
|
|
/** |
86
|
|
|
* Defined in 8.2.5. |
87
|
|
|
*/ |
88
|
|
|
const IM_INITIAL = 0; |
89
|
|
|
|
90
|
|
|
const IM_BEFORE_HTML = 1; |
91
|
|
|
|
92
|
|
|
const IM_BEFORE_HEAD = 2; |
93
|
|
|
|
94
|
|
|
const IM_IN_HEAD = 3; |
95
|
|
|
|
96
|
|
|
const IM_IN_HEAD_NOSCRIPT = 4; |
97
|
|
|
|
98
|
|
|
const IM_AFTER_HEAD = 5; |
99
|
|
|
|
100
|
|
|
const IM_IN_BODY = 6; |
101
|
|
|
|
102
|
|
|
const IM_TEXT = 7; |
103
|
|
|
|
104
|
|
|
const IM_IN_TABLE = 8; |
105
|
|
|
|
106
|
|
|
const IM_IN_TABLE_TEXT = 9; |
107
|
|
|
|
108
|
|
|
const IM_IN_CAPTION = 10; |
109
|
|
|
|
110
|
|
|
const IM_IN_COLUMN_GROUP = 11; |
111
|
|
|
|
112
|
|
|
const IM_IN_TABLE_BODY = 12; |
113
|
|
|
|
114
|
|
|
const IM_IN_ROW = 13; |
115
|
|
|
|
116
|
|
|
const IM_IN_CELL = 14; |
117
|
|
|
|
118
|
|
|
const IM_IN_SELECT = 15; |
119
|
|
|
|
120
|
|
|
const IM_IN_SELECT_IN_TABLE = 16; |
121
|
|
|
|
122
|
|
|
const IM_AFTER_BODY = 17; |
123
|
|
|
|
124
|
|
|
const IM_IN_FRAMESET = 18; |
125
|
|
|
|
126
|
|
|
const IM_AFTER_FRAMESET = 19; |
127
|
|
|
|
128
|
|
|
const IM_AFTER_AFTER_BODY = 20; |
129
|
|
|
|
130
|
|
|
const IM_AFTER_AFTER_FRAMESET = 21; |
131
|
|
|
|
132
|
|
|
const IM_IN_SVG = 22; |
133
|
|
|
|
134
|
|
|
const IM_IN_MATHML = 23; |
135
|
|
|
|
136
|
|
|
protected $options = array(); |
137
|
|
|
|
138
|
|
|
protected $stack = array(); |
139
|
|
|
|
140
|
|
|
protected $current; // Pointer in the tag hierarchy. |
141
|
|
|
protected $rules; |
142
|
|
|
protected $doc; |
143
|
|
|
|
144
|
|
|
protected $frag; |
145
|
|
|
|
146
|
|
|
protected $processor; |
147
|
|
|
|
148
|
|
|
protected $insertMode = 0; |
149
|
|
|
|
150
|
|
|
/** |
151
|
|
|
* Track if we are in an element that allows only inline child nodes. |
152
|
|
|
* |
153
|
|
|
* @var string|null |
154
|
|
|
*/ |
155
|
|
|
protected $onlyInline; |
156
|
|
|
|
157
|
|
|
/** |
158
|
|
|
* Quirks mode is enabled by default. |
159
|
|
|
* Any document that is missing the DT will be considered to be in quirks mode. |
160
|
|
|
*/ |
161
|
|
|
protected $quirks = true; |
162
|
|
|
|
163
|
|
|
protected $errors = array(); |
164
|
|
|
|
165
|
115 |
|
/**
 * Set up the tree builder and the document (or fragment) it writes into.
 *
 * Option keys read here:
 *  - self::OPT_TARGET_DOC: an existing document to build into. When absent,
 *    a new document with an "html" doctype is created.
 *  - 'encoding': encoding assigned to a newly created document
 *    (defaults to 'UTF-8'; ignored when a target document is supplied).
 *  - self::OPT_IMPLICIT_NS, or the legacy key 'implicitNamespaces':
 *    prefix => URI map of namespaces that are available without declaration.
 *
 * The full options array is stored, so keys consulted by other methods
 * (e.g. 'xmlNamespaces', self::OPT_DISABLE_HTML_NS) are passed here too.
 *
 * @param bool  $isFragment When true, nodes are appended to a document fragment
 *                          and the insert mode starts at IM_IN_BODY.
 * @param array $options    Builder options, see above.
 */
public function __construct($isFragment = false, array $options = array())
{
    $this->options = $options;

    if (isset($options[self::OPT_TARGET_DOC])) {
        $this->doc = $options[self::OPT_TARGET_DOC];
    } else {
        $impl = new \DOMImplementation();
        // XXX:
        // Create the doctype. For now, we are always creating HTML5
        // documents, and attempting to up-convert any older DTDs to HTML5.
        $dt = $impl->createDocumentType('html');
        // The qualified name must be a string: passing null to this
        // non-nullable parameter is deprecated as of PHP 8.1. An empty
        // string likewise creates a document without a root element.
        $this->doc = $impl->createDocument(null, '', $dt);
        $this->doc->encoding = !empty($options['encoding']) ? $options['encoding'] : 'UTF-8';
    }

    $this->errors = array();

    // Start at the document itself, not at its (not yet existing) root element.
    $this->current = $this->doc;

    // Create a rules engine for tags.
    $this->rules = new TreeBuildingRules();

    $implicitNS = array();
    if (isset($this->options[self::OPT_IMPLICIT_NS])) {
        $implicitNS = $this->options[self::OPT_IMPLICIT_NS];
    } elseif (isset($this->options['implicitNamespaces'])) {
        // Legacy spelling of the same option.
        $implicitNS = $this->options['implicitNamespaces'];
    }

    // Fill $nsStack with the default HTML5 namespaces, plus the implicit
    // namespaces taken from $options. With the array union operator the
    // left-hand operand wins, so caller-supplied entries take precedence.
    array_unshift($this->nsStack, $implicitNS + array('' => self::NAMESPACE_HTML) + $this->implicitNamespaces);

    if ($isFragment) {
        $this->insertMode = static::IM_IN_BODY;
        $this->frag = $this->doc->createDocumentFragment();
        $this->current = $this->frag;
    }
}
205
|
|
|
|
206
|
|
|
/**
 * Get the document this builder writes into.
 *
 * This is either the document passed through the target-document option or
 * the one created by the constructor.
 *
 * @return \DOMDocument
 */
public function document()
{
    return $this->doc;
}
213
|
|
|
|
214
|
|
|
/**
 * Get the DOM fragment that receives the parsed nodes in fragment mode.
 *
 * A fragment is used because fragment input may have zero or more nodes at
 * its root. The fragment is only created when the builder was constructed
 * with $isFragment set; otherwise this property was never assigned.
 *
 * @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#concept-frag-parse-context
 *
 * @return \DOMDocumentFragment|null
 */
public function fragment()
{
    return $this->frag;
}
228
|
|
|
|
229
|
|
|
/**
 * Register the handler used for processing instructions.
 *
 * When a processor is set, processingInstruction() delegates to it instead
 * of inserting the PI into the DOM tree. Without one, PIs are inserted
 * directly.
 *
 * @param InstructionProcessor $proc The processor to delegate to.
 */
public function setInstructionProcessor(InstructionProcessor $proc)
{
    $this->processor = $proc;
}
241
|
|
|
|
242
|
97 |
|
/**
 * Handle a DOCTYPE event.
 *
 * The inbound doctype is not preserved (the document is always HTML5); this
 * event only records quirks mode and advances the insert mode to
 * IM_BEFORE_HTML. A DOCTYPE arriving after the initial insert mode is
 * reported as a parse error and ignored entirely, so it no longer overwrites
 * the quirks flag established earlier.
 *
 * @param string      $name   The doctype name (only used in the error message).
 * @param int         $idType Unused here.
 * @param string|null $id     Unused here.
 * @param bool        $quirks Whether the document should be in quirks mode.
 */
public function doctype($name, $idType = 0, $id = null, $quirks = false)
{
    if ($this->insertMode > static::IM_INITIAL) {
        $this->parseError('Illegal placement of DOCTYPE tag. Ignoring: ' . $name);

        return;
    }

    // This is used solely for setting quirks mode. Currently we don't
    // try to preserve the inbound DT. We convert it to HTML5.
    $this->quirks = $quirks;

    $this->insertMode = static::IM_BEFORE_HTML;
}
256
|
|
|
|
257
|
|
|
/**
 * Process the start tag.
 *
 * Creates the element (in the namespace currently on top of the namespace
 * stack), copies its attributes, attaches it to the tree and updates the
 * insert mode. An <html> element is started implicitly when the document has
 * none, and a <body> is started implicitly after </head>.
 *
 * @todo - XMLNS namespace handling (we need to parse, even if it's not valid)
 *       - XLink, MathML and SVG namespace handling
 *       - Omission rules: 8.1.2.4 Optional tags
 *
 * @param string $name        The tag name.
 * @param array  $attributes  Attribute name => value pairs.
 * @param bool   $selfClosing Whether the tag was written as self-closing. Only
 *                            honoured for elements Elements::isHtml5Element()
 *                            does not recognise.
 *
 * @return int The element mask from Elements::element(), which the tokenizer
 *             can use to set various processing rules.
 */
public function startTag($name, $attributes = array(), $selfClosing = false)
{
    $lname = $this->normalizeTagName($name);

    // Make sure we have an html element.
    if (!$this->doc->documentElement && 'html' !== $name && !$this->frag) {
        $this->startTag('html');
    }

    // Set quirks mode if we're at IM_INITIAL with no doctype.
    if ($this->insertMode === static::IM_INITIAL) {
        $this->quirks = true;
        $this->parseError('No DOCTYPE specified.');
    }

    // SPECIAL TAG HANDLING:
    // Outside of SVG/MathML the spec says to treat <image> as <img>
    // ("don't ask"). Both the lookup name and the name used to create the
    // element must change, otherwise an <image> element would be created
    // while being handled as a void <img>.
    if ('image' === $name && !($this->insertMode === static::IM_IN_SVG || $this->insertMode === static::IM_IN_MATHML)) {
        $name = 'img';
        $lname = $this->normalizeTagName($name);
    }

    // Autoclose p tags where appropriate.
    if ($this->insertMode >= static::IM_IN_BODY && Elements::isA($name, Elements::AUTOCLOSE_P)) {
        $this->autoclose('p');
    }

    // Set insert mode:
    switch ($name) {
        case 'html':
            $this->insertMode = static::IM_BEFORE_HEAD;
            break;
        case 'head':
            if ($this->insertMode > static::IM_BEFORE_HEAD) {
                $this->parseError('Unexpected head tag outside of head context.');
            } else {
                $this->insertMode = static::IM_IN_HEAD;
            }
            break;
        case 'body':
            $this->insertMode = static::IM_IN_BODY;
            break;
        case 'svg':
            $this->insertMode = static::IM_IN_SVG;
            break;
        case 'math':
            $this->insertMode = static::IM_IN_MATHML;
            break;
        case 'noscript':
            if ($this->insertMode === static::IM_IN_HEAD) {
                $this->insertMode = static::IM_IN_HEAD_NOSCRIPT;
            }
            break;
    }

    // Case when no <body> exists, note section on 'Anything else' below.
    // https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode
    if ($this->insertMode === static::IM_AFTER_HEAD && 'head' !== $name && 'body' !== $name) {
        $this->startTag('body');
    }

    // Special case handling for SVG.
    if ($this->insertMode === static::IM_IN_SVG) {
        $lname = Elements::normalizeSvgElement($lname);
    }

    // Number of entries this tag pushes onto $nsStack.
    $pushes = 0;
    // When we find a tag that appears inside $nsRoots, we have to switch the default namespace.
    if (isset($this->nsRoots[$lname]) && $this->nsStack[0][''] !== $this->nsRoots[$lname]) {
        array_unshift($this->nsStack, array(
            '' => $this->nsRoots[$lname],
        ) + $this->nsStack[0]);
        ++$pushes;
    }
    // Holds the value of a plain "xmlns" attribute, or false when there is none.
    $needsWorkaround = false;
    if (isset($this->options['xmlNamespaces']) && $this->options['xmlNamespaces']) {
        // When xmlNamespaces is true and we find an 'xmlns' or 'xmlns:*' attribute,
        // we add a new item to the $nsStack.
        foreach ($attributes as $aName => $aVal) {
            if ('xmlns' === $aName) {
                $needsWorkaround = $aVal;
                array_unshift($this->nsStack, array(
                    '' => $aVal,
                ) + $this->nsStack[0]);
                ++$pushes;
            } elseif ('xmlns' === (($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : '')) {
                array_unshift($this->nsStack, array(
                    substr($aName, $pos + 1) => $aVal,
                ) + $this->nsStack[0]);
                ++$pushes;
            }
        }
    }

    if ($this->onlyInline && Elements::isA($lname, Elements::BLOCK_TAG)) {
        $this->autoclose($this->onlyInline);
        $this->onlyInline = null;
    }

    try {
        $prefix = ($pos = strpos($lname, ':')) ? substr($lname, 0, $pos) : '';

        if (false !== $needsWorkaround) {
            // The element is built by parsing a tiny XML document and importing
            // its root. The namespace URIs come from the parsed input, so they
            // must be escaped before being placed inside attribute values.
            $xml = "<$lname xmlns=\"" . htmlspecialchars($needsWorkaround, ENT_QUOTES, 'UTF-8') . '" '
                . (strlen($prefix) && isset($this->nsStack[0][$prefix])
                    ? ("xmlns:$prefix=\"" . htmlspecialchars($this->nsStack[0][$prefix], ENT_QUOTES, 'UTF-8') . '"')
                    : '')
                . '/>';

            $frag = new \DOMDocument('1.0', 'UTF-8');

            // A failed load leaves no root element to import; route that case
            // to the same <invalid> fallback used for illegal tag names.
            if (!$frag->loadXML($xml) || null === $frag->documentElement) {
                throw new \DOMException("Unable to build namespaced element <$lname>.");
            }

            $ele = $this->doc->importNode($frag->documentElement, true);
        } else {
            if (!isset($this->nsStack[0][$prefix]) || ('' === $prefix && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS])) {
                $ele = $this->doc->createElement($lname);
            } else {
                $ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
            }
        }
    } catch (\DOMException $e) {
        $this->parseError("Illegal tag name: <$lname>. Replaced with <invalid>.");
        $ele = $this->doc->createElement('invalid');
    }

    if (Elements::isA($lname, Elements::BLOCK_ONLY_INLINE)) {
        $this->onlyInline = $lname;
    }

    // When we add some namespaces, we have to track them. Later, when endTag() is invoked, we have to remove them.
    // When we are on a void tag, we do not need to care about namespace nesting.
    if ($pushes > 0 && !Elements::isA($name, Elements::VOID_TAG)) {
        // PHP tends to free the memory used by DOM,
        // to avoid spl_object_hash collisions we have to avoid garbage collection of $ele by storing it in $pushes
        // see https://bugs.php.net/bug.php?id=67459
        $this->pushes[spl_object_hash($ele)] = array($pushes, $ele);
    }

    foreach ($attributes as $aName => $aVal) {
        // xmlns attributes can't be set
        if ('xmlns' === $aName) {
            continue;
        }

        if ($this->insertMode === static::IM_IN_SVG) {
            $aName = Elements::normalizeSvgAttribute($aName);
        } elseif ($this->insertMode === static::IM_IN_MATHML) {
            $aName = Elements::normalizeMathMlAttribute($aName);
        }

        try {
            $prefix = ($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : false;

            if ('xmlns' === $prefix) {
                $ele->setAttributeNS(self::NAMESPACE_XMLNS, $aName, $aVal);
            } elseif (false !== $prefix && isset($this->nsStack[0][$prefix])) {
                $ele->setAttributeNS($this->nsStack[0][$prefix], $aName, $aVal);
            } else {
                $ele->setAttribute($aName, $aVal);
            }
        } catch (\DOMException $e) {
            $this->parseError("Illegal attribute name for tag $name. Ignoring: $aName");
            continue;
        }

        // This is necessary on a non-DTD schema, like HTML5.
        if ('id' === $aName) {
            $ele->setIdAttribute('id', true);
        }
    }

    if ($this->frag !== $this->current && $this->rules->hasRules($name)) {
        // Some elements have special processing rules. Handle those separately.
        $this->current = $this->rules->evaluate($ele, $this->current);
    } else {
        // Otherwise, it's a standard element.
        $this->current->appendChild($ele);

        if (!Elements::isA($name, Elements::VOID_TAG)) {
            $this->current = $ele;
        }

        // Self-closing tags should only be respected on foreign elements
        // (and are implied on void elements)
        // See: https://www.w3.org/TR/html5/syntax.html#start-tags
        if (Elements::isHtml5Element($name)) {
            $selfClosing = false;
        }
    }

    // This is sort of a last-ditch attempt to correct for cases where no head/body
    // elements are provided.
    if ($this->insertMode <= static::IM_BEFORE_HEAD && 'head' !== $name && 'html' !== $name) {
        $this->insertMode = static::IM_IN_BODY;
    }

    // When we are on a void tag, we do not need to care about namespace nesting,
    // but we have to remove the namespaces pushed to $nsStack.
    if ($pushes > 0 && Elements::isA($name, Elements::VOID_TAG)) {
        // remove the namespaces defined by the current node
        for ($i = 0; $i < $pushes; ++$i) {
            array_shift($this->nsStack);
        }
    }

    if ($selfClosing) {
        $this->endTag($name);
    }

    // Return the element mask, which the tokenizer can then use to set
    // various processing rules.
    return Elements::element($name);
}
478
|
|
|
|
479
|
108 |
|
/**
 * Process an end tag.
 *
 * Closes the nearest open element with the given name (climbing the tree via
 * autoclose()), pops any namespaces the current element pushed, and updates
 * the insert mode for head, body, svg and math.
 *
 * End tags for void elements and for "html" are ignored. Before any html
 * element exists, only a small set of end tags implicitly starts <html>;
 * every other end tag is reported as a parse error and dropped.
 *
 * @param string $name The tag name.
 */
public function endTag($name)
{
    $lname = $this->normalizeTagName($name);

    // Ignore closing tags for unary elements.
    if (Elements::isA($name, Elements::VOID_TAG)) {
        return;
    }

    if ($this->insertMode <= static::IM_BEFORE_HTML) {
        // 8.2.5.4.2
        if (in_array($name, array(
            'html',
            'br',
            'head',
            'title',
        ), true)) {
            // startTag('html') moves the insert mode past IM_BEFORE_HTML, so
            // the recursive call below takes the normal path.
            $this->startTag('html');
            $this->endTag($name);
            $this->insertMode = static::IM_BEFORE_HEAD;

            return;
        }

        // Ignore the tag.
        $this->parseError('Illegal closing tag at global scope.');

        return;
    }

    // Special case handling for SVG.
    if ($this->insertMode === static::IM_IN_SVG) {
        $lname = Elements::normalizeSvgElement($lname);
    }

    $cid = spl_object_hash($this->current);

    // XXX: HTML has no parent. What do we do, though,
    // if this element appears in the wrong place?
    if ('html' === $lname) {
        return;
    }

    // remove the namespaces defined by the current node
    if (isset($this->pushes[$cid])) {
        for ($i = 0; $i < $this->pushes[$cid][0]; ++$i) {
            array_shift($this->nsStack);
        }
        unset($this->pushes[$cid]);
    }

    if (!$this->autoclose($lname)) {
        $this->parseError('Could not find closing tag for ' . $lname);
    }

    switch ($lname) {
        case 'head':
            $this->insertMode = static::IM_AFTER_HEAD;
            break;
        case 'body':
            $this->insertMode = static::IM_AFTER_BODY;
            break;
        case 'svg':
        case 'math':
            // startTag() enters IM_IN_MATHML on "math", so that is the name
            // that must leave it again.
        case 'mathml':
            // Kept for backward compatibility with the previous behaviour.
            $this->insertMode = static::IM_IN_BODY;
            break;
    }
}
547
|
|
|
|
548
|
5 |
|
/**
 * Append a comment node to the current insertion point.
 *
 * @param string $cdata The comment text.
 */
public function comment($cdata)
{
    // TODO: Need to handle case where comment appears outside of the HTML tag.
    $this->current->appendChild($this->doc->createComment($cdata));
}
554
|
|
|
|
555
|
92 |
|
/**
 * Append a text node to the current insertion point.
 *
 * Before the head has been opened, whitespace-only text is discarded. Any
 * other text implies an (empty) head followed by the start of the body, and
 * is then inserted there.
 *
 * @param string $data The character data.
 */
public function text($data)
{
    // https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode
    if ($this->insertMode < static::IM_IN_HEAD) {
        // Per '8.2.5.4.3 The "before head" insertion mode' the characters
        // " \t\n\r\f" should be ignored.
        $dataTmp = trim($data, " \t\n\r\f");

        // Compare against the empty string explicitly: empty() also treats
        // the text "0" as empty, which would silently drop real content.
        if ('' === $dataTmp) {
            return;
        }

        $this->startTag('head');
        $this->endTag('head');
        $this->startTag('body');
    }

    $node = $this->doc->createTextNode($data);
    $this->current->appendChild($node);
}
574
|
|
|
|
575
|
115 |
|
/**
 * Handle the end of the input stream.
 *
 * Intentionally does nothing at present.
 */
public function eof()
{
    // If the $current isn't the $root, do we need to do anything?
}
579
|
|
|
|
580
|
13 |
|
/**
 * Record a parse error.
 *
 * Errors are collected as formatted strings rather than thrown; retrieve
 * them with getErrors().
 *
 * @param string $msg  Description of the problem.
 * @param int    $line Line number, 0 when not supplied.
 * @param int    $col  Column number, 0 when not supplied.
 */
public function parseError($msg, $line = 0, $col = 0)
{
    $formatted = sprintf('Line %d, Col %d: %s', $line, $col, $msg);
    array_push($this->errors, $formatted);
}
584
|
|
|
|
585
|
109 |
|
/**
 * Get the parse errors recorded so far.
 *
 * @return array List of messages as formatted by parseError().
 */
public function getErrors()
{
    return $this->errors;
}
589
|
|
|
|
590
|
3 |
|
/**
 * Append a CDATA section to the current insertion point.
 *
 * @param string $data The CDATA content.
 */
public function cdata($data)
{
    $this->current->appendChild($this->doc->createCDATASection($data));
}
595
|
|
|
|
596
|
5 |
|
/**
 * Handle a processing instruction.
 *
 * An "xml" instruction seen while still in the initial insert mode (i.e. the
 * XML declaration) is dropped. If an instruction processor has been set, the
 * PI is handed to it and its non-empty return value becomes the new current
 * node. Otherwise the PI is inserted into the tree as-is.
 *
 * @param string      $name The PI target.
 * @param string|null $data The PI content, if any.
 */
public function processingInstruction($name, $data = null)
{
    // XXX: Ignore initial XML declaration, per the spec.
    if ($this->insertMode === static::IM_INITIAL && 'xml' === strtolower($name)) {
        return;
    }

    // Important: The processor may modify the current DOM tree however it sees fit.
    if ($this->processor instanceof InstructionProcessor) {
        $res = $this->processor->process($this->current, $name, $data);
        if (!empty($res)) {
            $this->current = $res;
        }

        return;
    }

    // Otherwise, this is just a dumb PI element.
    // createProcessingInstruction() takes a non-nullable string; passing the
    // null default through is deprecated as of PHP 8.1, so normalise it to
    // the empty string (which is what null was coerced to before).
    $node = $this->doc->createProcessingInstruction($name, null === $data ? '' : $data);

    $this->current->appendChild($node);
}
618
|
|
|
|
619
|
|
|
// ========================================================================== |
620
|
|
|
// UTILITIES |
621
|
|
|
// ========================================================================== |
622
|
|
|
|
623
|
|
|
/**
 * Apply normalization rules to a tag name.
 * See sections 2.9 and 8.1.2.
 *
 * Currently the name is returned unchanged: section 2.9 suggests that a
 * namespace prefix (the part before ':') should not be stripped, so no
 * prefix removal is performed.
 *
 * @param string $tagName
 *
 * @return string The normalized tag name.
 */
protected function normalizeTagName($tagName)
{
    return $tagName;
}
638
|
|
|
|
639
|
|
|
/**
 * Placeholder for quirks-mode tree resolution.
 *
 * @param string $name
 *
 * @throws \Exception Always; this is not implemented.
 */
protected function quirksTreeResolver($name)
{
    throw new \Exception('Not implemented.');
}
643
|
|
|
|
644
|
|
|
/**
 * Close the nearest open element named $tagName.
 *
 * Starting at the current node, walk up through the ancestors. On the first
 * element whose tag name matches, the insertion point moves to that
 * element's parent. The walk stops as soon as a non-element node is reached
 * or there is no further parent.
 *
 * @param string $tagName
 *
 * @return bool True if a matching element was found and closed, false otherwise.
 */
protected function autoclose($tagName)
{
    $node = $this->current;

    while (true) {
        // Reached the document/fragment (or any other non-element): give up.
        if (XML_ELEMENT_NODE !== $node->nodeType) {
            return false;
        }

        if ($tagName === $node->tagName) {
            $this->current = $node->parentNode;

            return true;
        }

        $node = $node->parentNode;
        if (!$node) {
            return false;
        }
    }
}
667
|
|
|
|
668
|
|
|
/**
 * Checks if the given tagname is an ancestor of the present candidate.
 *
 * If $this->current or anything above $this->current matches the given tag
 * name, this returns true. The walk ends at the first non-element node, or
 * when a node has no parent (same stop condition as autoclose()).
 *
 * @param string $tagName
 *
 * @return bool
 */
protected function isAncestor($tagName)
{
    $candidate = $this->current;
    // A detached element has a null parentNode; stop instead of dereferencing it.
    while ($candidate && XML_ELEMENT_NODE === $candidate->nodeType) {
        if ($candidate->tagName === $tagName) {
            return true;
        }
        $candidate = $candidate->parentNode;
    }

    return false;
}
690
|
|
|
|
691
|
|
|
/**
 * Returns true if the immediate parent element is of the given tagname.
 *
 * The "parent" is the current insertion point. It can also be the document
 * or a document fragment, which have no tag name; those never match.
 *
 * @param string $tagName
 *
 * @return bool
 */
protected function isParent($tagName)
{
    return XML_ELEMENT_NODE === $this->current->nodeType
        && $this->current->tagName === $tagName;
}
702
|
|
|
} |
703
|
|
|
|
If a method or function can return multiple different values, and you are not sure that you can only receive a single value in this context, we recommend adding an additional type check:
If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.