1
|
|
|
<?php |
2
|
|
|
namespace Masterminds\Html5\Parser; |
3
|
|
|
|
4
|
|
|
use Masterminds\Html5\Elements; |
5
|
|
|
use Psr\Log\LoggerInterface; |
6
|
|
|
use Psr\Log\NullLogger; |
7
|
|
|
|
8
|
|
|
/** |
9
|
|
|
* Create an HTML5 DOM tree from events. |
10
|
|
|
* |
11
|
|
|
* This attempts to create a DOM from events emitted by a parser. This |
12
|
|
|
* attempts (but does not guarantee) to up-convert older HTML documents |
13
|
|
|
* to HTML5. It does this by applying HTML5's rules, but it will not |
14
|
|
|
* change the architecture of the document itself. |
15
|
|
|
* |
16
|
|
|
* Many of the error correction and quirks features suggested in the specification |
17
|
|
|
* are implemented herein; however, not all of them are. Since we do not |
18
|
|
|
* assume a graphical user agent, no presentation-specific logic is conducted |
19
|
|
|
* during tree building. |
20
|
|
|
* |
21
|
|
|
* FIXME: The present tree builder does not exactly follow the state machine rules |
22
|
|
|
* for insert modes as outlined in the HTML5 spec. The processor needs to be |
23
|
|
|
* re-written to accommodate this. See, for example, the Go language HTML5 |
24
|
|
|
* parser. |
25
|
|
|
*/ |
26
|
|
|
class DOMTreeBuilder implements EventHandler |
27
|
|
|
{ |
28
|
|
|
/** |
29
|
|
|
* Defined in http://www.w3.org/TR/html51/infrastructure.html#html-namespace-0 |
30
|
|
|
*/ |
31
|
|
|
const NAMESPACE_HTML = 'http://www.w3.org/1999/xhtml'; |
32
|
|
|
|
33
|
|
|
const NAMESPACE_MATHML = 'http://www.w3.org/1998/Math/MathML'; |
34
|
|
|
|
35
|
|
|
const NAMESPACE_SVG = 'http://www.w3.org/2000/svg'; |
36
|
|
|
|
37
|
|
|
const NAMESPACE_XLINK = 'http://www.w3.org/1999/xlink'; |
38
|
|
|
|
39
|
|
|
const NAMESPACE_XML = 'http://www.w3.org/XML/1998/namespace'; |
40
|
|
|
|
41
|
|
|
const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/'; |
42
|
|
|
|
43
|
|
|
const OPT_DISABLE_HTML_NS = 'disable_html_ns'; |
44
|
|
|
|
45
|
|
|
const OPT_TARGET_DOC = 'target_document'; |
46
|
|
|
|
47
|
|
|
const OPT_IMPLICIT_NS = 'implicit_namespaces'; |
48
|
|
|
|
49
|
|
|
/** |
50
|
|
|
* Holds the HTML5 element names that cause a namespace switch |
51
|
|
|
* |
52
|
|
|
* @var array |
53
|
|
|
*/ |
54
|
|
|
protected $nsRoots = array( |
55
|
|
|
'html' => self::NAMESPACE_HTML, |
56
|
|
|
'svg' => self::NAMESPACE_SVG, |
57
|
|
|
'math' => self::NAMESPACE_MATHML |
58
|
|
|
); |
59
|
|
|
|
60
|
|
|
/** |
61
|
|
|
* @var LoggerInterface |
62
|
|
|
*/ |
63
|
|
|
protected $logger; |
64
|
|
|
|
65
|
|
|
/** |
66
|
|
|
* Holds the always available namespaces (which do not require the XMLNS declaration). |
67
|
|
|
* |
68
|
|
|
* @var array |
69
|
|
|
*/ |
70
|
|
|
protected $implicitNamespaces = array( |
71
|
|
|
'xml' => self::NAMESPACE_XML, |
72
|
|
|
'xmlns' => self::NAMESPACE_XMLNS, |
73
|
|
|
'xlink' => self::NAMESPACE_XLINK |
74
|
|
|
); |
75
|
|
|
|
76
|
|
|
/** |
77
|
|
|
* Holds a stack of currently active namespaces. |
78
|
|
|
* |
79
|
|
|
* @var array |
80
|
|
|
*/ |
81
|
|
|
protected $nsStack = array(); |
82
|
|
|
|
83
|
|
|
/** |
84
|
|
|
* Holds the number of namespaces declared by a node. |
85
|
|
|
* |
86
|
|
|
* @var array |
87
|
|
|
*/ |
88
|
|
|
protected $pushes = array(); |
89
|
|
|
|
90
|
|
|
/** |
91
|
|
|
* Defined in 8.2.5. |
92
|
|
|
*/ |
93
|
|
|
const IM_INITIAL = 0; |
94
|
|
|
|
95
|
|
|
const IM_BEFORE_HTML = 1; |
96
|
|
|
|
97
|
|
|
const IM_BEFORE_HEAD = 2; |
98
|
|
|
|
99
|
|
|
const IM_IN_HEAD = 3; |
100
|
|
|
|
101
|
|
|
const IM_IN_HEAD_NOSCRIPT = 4; |
102
|
|
|
|
103
|
|
|
const IM_AFTER_HEAD = 5; |
104
|
|
|
|
105
|
|
|
const IM_IN_BODY = 6; |
106
|
|
|
|
107
|
|
|
const IM_TEXT = 7; |
108
|
|
|
|
109
|
|
|
const IM_IN_TABLE = 8; |
110
|
|
|
|
111
|
|
|
const IM_IN_TABLE_TEXT = 9; |
112
|
|
|
|
113
|
|
|
const IM_IN_CAPTION = 10; |
114
|
|
|
|
115
|
|
|
const IM_IN_COLUMN_GROUP = 11; |
116
|
|
|
|
117
|
|
|
const IM_IN_TABLE_BODY = 12; |
118
|
|
|
|
119
|
|
|
const IM_IN_ROW = 13; |
120
|
|
|
|
121
|
|
|
const IM_IN_CELL = 14; |
122
|
|
|
|
123
|
|
|
const IM_IN_SELECT = 15; |
124
|
|
|
|
125
|
|
|
const IM_IN_SELECT_IN_TABLE = 16; |
126
|
|
|
|
127
|
|
|
const IM_AFTER_BODY = 17; |
128
|
|
|
|
129
|
|
|
const IM_IN_FRAMESET = 18; |
130
|
|
|
|
131
|
|
|
const IM_AFTER_FRAMESET = 19; |
132
|
|
|
|
133
|
|
|
const IM_AFTER_AFTER_BODY = 20; |
134
|
|
|
|
135
|
|
|
const IM_AFTER_AFTER_FRAMESET = 21; |
136
|
|
|
|
137
|
|
|
const IM_IN_SVG = 22; |
138
|
|
|
|
139
|
|
|
const IM_IN_MATHML = 23; |
140
|
|
|
|
141
|
|
|
/**
 * Options handed to the constructor (see the OPT_* constants).
 *
 * @var array
 */
protected $options = array();

/**
 * @var array
 */
protected $stack = array();

/**
 * Pointer in the tag hierarchy: the node new children are appended to.
 */
protected $current;

/**
 * Rules engine for tags with special processing; assigned in the constructor.
 *
 * Declared explicitly so the constructor does not create a dynamic property
 * (dynamic property creation is deprecated as of PHP 8.2).
 */
protected $rules;

/**
 * The document being built (or the caller-supplied target document).
 */
protected $doc;

/**
 * The document fragment being built; only set when parsing a fragment.
 */
protected $frag;

/**
 * Optional processing-instruction handler set via setInstructionProcessor().
 */
protected $processor;

/**
 * Current insert mode, one of the IM_* constants. 0 is IM_INITIAL.
 *
 * @var int
 */
protected $insertMode = 0;

/**
 * Track if we are in an element that allows only inline child nodes
 * @var string|null
 */
protected $onlyInline;

/**
 * Quirks mode is enabled by default.
 * Any document that is missing the
 * DT will be considered to be in quirks mode.
 */
protected $quirks = true;

/**
 * Parse error messages collected by parseError().
 *
 * @var array
 */
protected $errors = array();
168
|
|
|
|
169
|
96 |
|
/**
 * Prepare the builder: pick the target document, seed the namespace stack
 * and, when requested, switch to fragment mode.
 *
 * @param bool                 $isFragment When truthy, nodes are collected in a
 *                                         document fragment and the insert mode
 *                                         starts at IM_IN_BODY.
 * @param array                $options    Builder options (see the OPT_* constants;
 *                                         the legacy "implicitNamespaces" key is
 *                                         also honoured).
 * @param LoggerInterface|null $logger     Logger; a NullLogger is used when omitted.
 */
public function __construct($isFragment = false, array $options = array(), LoggerInterface $logger = null)
{
    $this->options = $options;
    $this->logger = $logger ?: new NullLogger();

    if (!isset($options[self::OPT_TARGET_DOC])) {
        // XXX:
        // Create the doctype. For now, we are always creating HTML5
        // documents, and attempting to up-convert any older DTDs to HTML5.
        $implementation = new \DOMImplementation();
        $doctype = $implementation->createDocumentType('html');
        $this->doc = $implementation->createDocument(null, null, $doctype);
    } else {
        $this->doc = $options[self::OPT_TARGET_DOC];
    }
    $this->errors = array();

    // Start at the document itself, not at its document element.
    $this->current = $this->doc;

    // Create a rules engine for tags.
    $this->rules = new TreeBuildingRules($this->doc);

    // The constant-named option takes precedence over the legacy key.
    $extraNamespaces = array();
    foreach (array(self::OPT_IMPLICIT_NS, "implicitNamespaces") as $optionKey) {
        if (isset($this->options[$optionKey])) {
            $extraNamespaces = $this->options[$optionKey];
            break;
        }
    }

    // Seed $nsStack with the default HTML5 namespaces plus the caller-supplied
    // implicit namespaces. With "+", the left-most definition of a prefix wins.
    $defaultNamespace = array(
        '' => self::NAMESPACE_HTML
    );
    array_unshift($this->nsStack, $extraNamespaces + $defaultNamespace + $this->implicitNamespaces);

    if ($isFragment) {
        $this->insertMode = static::IM_IN_BODY;
        $this->frag = $this->doc->createDocumentFragment();
        $this->current = $this->frag;
    }
}
210
|
|
|
|
211
|
|
|
/**
 * Get the document this builder writes into.
 *
 * @return \DOMDocument The document created in the constructor, or the one
 *                      supplied through the target-document option.
 */
public function document()
{
    $document = $this->doc;

    return $document;
}
218
|
|
|
|
219
|
|
|
/**
 * Get the DOM fragment holding the parsed nodes.
 *
 * A fragment may have zero or more nodes at its root. The fragment is only
 * created when the builder was constructed in fragment mode; otherwise
 * nothing has been assigned and null is returned.
 *
 * @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#concept-frag-parse-context
 *
 * @return \DOMDocumentFragment|null
 */
public function fragment()
{
    $fragment = $this->frag;

    return $fragment;
}
233
|
|
|
|
234
|
|
|
/**
 * Register a handler for processing instructions.
 *
 * When a processor is set, processingInstruction() hands every PI to it
 * instead of inserting a PI node into the DOM tree.
 *
 * @param \Masterminds\Html5\InstructionProcessor $proc The handler to use.
 */
public function setInstructionProcessor(\Masterminds\Html5\InstructionProcessor $proc)
{
    $instructionProcessor = $proc;
    $this->processor = $instructionProcessor;
}
244
|
|
|
|
245
|
83 |
|
/**
 * Handle a DOCTYPE event.
 *
 * This is used solely for setting quirks mode and advancing the insert mode.
 * Currently we don't try to preserve the inbound DT; the document is always
 * built as HTML5.
 *
 * @param string      $name   The doctype name (only used in the error message).
 * @param int         $idType Not consulted by this method.
 * @param string|null $id     Not consulted by this method.
 * @param bool        $quirks Whether the doctype puts the document in quirks mode.
 */
public function doctype($name, $idType = 0, $id = null, $quirks = false)
{
    // A DOCTYPE is only legal before anything else has been seen. A misplaced
    // one is reported and ignored entirely, so it must not touch the quirks
    // flag that was already decided for this document.
    if ($this->insertMode > static::IM_INITIAL) {
        $this->parseError("Illegal placement of DOCTYPE tag. Ignoring: " . $name);

        return;
    }

    $this->quirks = $quirks;
    $this->insertMode = static::IM_BEFORE_HTML;
}
259
|
|
|
|
260
|
|
|
/**
 * Process the start tag.
 *
 * Creates the element (in the namespace currently on top of the namespace
 * stack), copies the attributes onto it, appends it to the current node and,
 * unless the tag is void, makes it the new current node.
 *
 * @todo - XMLNS namespace handling (we need to parse, even if it's not valid)
 *       - XLink, MathML and SVG namespace handling
 *       - Omission rules: 8.1.2.4 Optional tags
 *
 * @param string $name        The tag name.
 * @param array  $attributes  Map of attribute name => attribute value.
 * @param bool   $selfClosing Not consulted by this method.
 *
 * @return mixed The result of Elements::element($name): the element mask, which
 *               the tokenizer can then use to set various processing rules.
 */
public function startTag($name, $attributes = array(), $selfClosing = false)
{
    $this->logger->debug("Starting tag $name");
    $lname = $this->normalizeTagName($name);

    // Make sure we have an html element.
    if (! $this->doc->documentElement && $name !== 'html' && ! $this->frag) {
        $this->startTag('html');
    }

    // Set quirks mode if we're at IM_INITIAL with no doctype.
    if ($this->insertMode == static::IM_INITIAL) {
        $this->quirks = true;
        $this->parseError("No DOCTYPE specified.");
    }

    // SPECIAL TAG HANDLING:
    // Spec says do this, and "don't ask."
    // Note: only $name is rewritten; $lname (used to create the element) was
    // computed above and keeps the original spelling.
    if ($name == 'image') {
        $name = 'img';
    }

    // Autoclose p tags where appropriate.
    if ($this->insertMode >= static::IM_IN_BODY && Elements::isA($name, Elements::AUTOCLOSE_P)) {
        $this->autoclose('p');
    }

    // Set insert mode:
    switch ($name) {
        case 'html':
            $this->insertMode = static::IM_BEFORE_HEAD;
            break;
        case 'head':
            if ($this->insertMode > static::IM_BEFORE_HEAD) {
                $this->parseError("Unexpected head tag outside of head context.");
            } else {
                $this->insertMode = static::IM_IN_HEAD;
            }
            break;
        case 'body':
            $this->insertMode = static::IM_IN_BODY;
            break;
        case 'svg':
            $this->insertMode = static::IM_IN_SVG;
            break;
        case 'math':
            $this->insertMode = static::IM_IN_MATHML;
            break;
        case 'noscript':
            if ($this->insertMode == static::IM_IN_HEAD) {
                $this->insertMode = static::IM_IN_HEAD_NOSCRIPT;
            }
            break;
    }

    // Special case handling for SVG.
    if ($this->insertMode == static::IM_IN_SVG) {
        $lname = Elements::normalizeSvgElement($lname);
    }

    // Number of entries this tag pushes onto $nsStack.
    $pushes = 0;
    // When we find a tag that appears inside $nsRoots, we have to switch the default namespace.
    if (isset($this->nsRoots[$lname]) && $this->nsStack[0][''] !== $this->nsRoots[$lname]) {
        array_unshift($this->nsStack, array(
            '' => $this->nsRoots[$lname]
        ) + $this->nsStack[0]);
        $pushes ++;
    }

    // false, or the value of this tag's own "xmlns" attribute.
    $needsWorkaround = false;
    if (isset($this->options["xmlNamespaces"]) && $this->options["xmlNamespaces"]) {
        // When xmlNamespaces is true and we find an 'xmlns' or 'xmlns:*' attribute,
        // we add a new item to the $nsStack.
        foreach ($attributes as $aName => $aVal) {
            if ($aName === 'xmlns') {
                $needsWorkaround = $aVal;
                array_unshift($this->nsStack, array(
                    '' => $aVal
                ) + $this->nsStack[0]);
                $pushes ++;
            } elseif ((($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : '') === 'xmlns') {
                array_unshift($this->nsStack, array(
                    substr($aName, $pos + 1) => $aVal
                ) + $this->nsStack[0]);
                $pushes ++;
            }
        }
    }

    // A block-level tag ends an element that may only contain inline children.
    if ($this->onlyInline && Elements::isA($lname, Elements::BLOCK_TAG)) {
        $this->autoclose($this->onlyInline);
        $this->onlyInline = null;
    }

    try {
        $prefix = ($pos = strpos($lname, ':')) ? substr($lname, 0, $pos) : '';

        if ($needsWorkaround !== false) {
            // The element is built by parsing a tiny XML document so that its
            // default-namespace declaration is carried along. The namespace
            // URIs come straight from the input, so they must be escaped before
            // being placed inside a double-quoted XML attribute.
            $xmlEscapes = array(
                '&' => '&amp;',
                '<' => '&lt;',
                '>' => '&gt;',
                '"' => '&quot;'
            );
            $prefixDeclaration = "";
            if (strlen($prefix) && isset($this->nsStack[0][$prefix])) {
                $prefixDeclaration = "xmlns:$prefix=\"" . strtr($this->nsStack[0][$prefix], $xmlEscapes) . "\"";
            }
            $xml = "<$lname xmlns=\"" . strtr($needsWorkaround, $xmlEscapes) . "\" " . $prefixDeclaration . "/>";

            $frag = new \DOMDocument('1.0', 'UTF-8');
            $frag->loadXML($xml);

            // If the snippet could not be parsed there is no root element to
            // import; fall through to the <invalid> replacement below instead
            // of handing null to importNode().
            if (! $frag->documentElement) {
                throw new \DOMException("Unable to build element <$lname>.");
            }

            $ele = $this->doc->importNode($frag->documentElement, true);
        } else {
            if (!isset($this->nsStack[0][$prefix]) || ($prefix === "" && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS])) {
                $ele = $this->doc->createElement($lname);
            } else {
                $ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
            }
        }
    } catch (\DOMException $e) {
        $this->parseError("Illegal tag name: <$lname>. Replaced with <invalid>.");
        $ele = $this->doc->createElement('invalid');
    }

    if (Elements::isA($lname, Elements::BLOCK_ONLY_INLINE)) {
        $this->onlyInline = $lname;
    }

    // When we add some namespaces, we have to track them. Later, when endTag() is invoked, we have to remove them.
    // When we are on a void tag, we do not need to care about namespace nesting.
    if ($pushes > 0 && !Elements::isA($name, Elements::VOID_TAG)) {
        // PHP tends to free the memory used by DOM,
        // to avoid spl_object_hash collisions we have to avoid garbage collection of $ele by storing it in $pushes
        // see https://bugs.php.net/bug.php?id=67459
        $this->pushes[spl_object_hash($ele)] = array($pushes, $ele);

        // SEE https://github.com/facebook/hhvm/issues/2962
        if (defined('HHVM_VERSION')) {
            $ele->setAttribute('html5-php-fake-id-attribute', spl_object_hash($ele));
        }
    }

    foreach ($attributes as $aName => $aVal) {
        // xmlns attributes can't be set
        if ($aName === 'xmlns') {
            continue;
        }

        if ($this->insertMode == static::IM_IN_SVG) {
            $aName = Elements::normalizeSvgAttribute($aName);
        } elseif ($this->insertMode == static::IM_IN_MATHML) {
            $aName = Elements::normalizeMathMlAttribute($aName);
        }

        try {
            $prefix = ($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : false;

            if ($prefix==='xmlns') {
                $ele->setAttributeNs(self::NAMESPACE_XMLNS, $aName, $aVal);
            } elseif ($prefix!==false && isset($this->nsStack[0][$prefix])) {
                $ele->setAttributeNs($this->nsStack[0][$prefix], $aName, $aVal);
            } else {
                $ele->setAttribute($aName, $aVal);
            }
        } catch (\DOMException $e) {
            $this->parseError("Illegal attribute name for tag $name. Ignoring: $aName");
            continue;
        }

        // This is necessary on a non-DTD schema, like HTML5.
        if ($aName == 'id') {
            $ele->setIdAttribute('id', true);
        }
    }

    // Some elements have special processing rules. Handle those separately.
    if ($this->rules->hasRules($name) && $this->frag !== $this->current) {
        $this->current = $this->rules->evaluate($ele, $this->current);
    } // Otherwise, it's a standard element.
    else {
        $this->current->appendChild($ele);

        // XXX: Need to handle self-closing tags and unary tags.
        if (! Elements::isA($name, Elements::VOID_TAG)) {
            $this->current = $ele;
        }
    }

    // This is sort of a last-ditch attempt to correct for cases where no head/body
    // elements are provided.
    if ($this->insertMode <= static::IM_BEFORE_HEAD && $name != 'head' && $name != 'html') {
        $this->insertMode = static::IM_IN_BODY;
    }

    // When we are on a void tag, we do not need to care about namespace nesting,
    // but we have to remove the namespaces pushed to $nsStack.
    if ($pushes > 0 && Elements::isA($name, Elements::VOID_TAG)) {
        // remove the namespaces defined by the current node
        for ($i = 0; $i < $pushes; $i ++) {
            array_shift($this->nsStack);
        }
    }

    // Return the element mask, which the tokenizer can then use to set
    // various processing rules.
    return Elements::element($name);
}
467
|
|
|
|
468
|
91 |
|
/**
 * Process an end tag.
 *
 * Pops any namespaces the element being closed pushed onto the namespace
 * stack, moves the current-node pointer above the closest matching open
 * element and updates the insert mode for head/body/svg/math.
 *
 * @param string $name The tag name.
 */
public function endTag($name)
{
    $lname = $this->normalizeTagName($name);

    // Ignore closing tags for unary elements.
    if (Elements::isA($name, Elements::VOID_TAG)) {
        return;
    }

    if ($this->insertMode <= static::IM_BEFORE_HTML) {
        // 8.2.5.4.2
        if (in_array($name, array(
            'html',
            'br',
            'head',
            'title'
        ))) {
            // Open the implied html element, then process this end tag again.
            $this->startTag('html');
            $this->endTag($name);
            $this->insertMode = static::IM_BEFORE_HEAD;

            return;
        }

        // Ignore the tag.
        $this->parseError("Illegal closing tag at global scope.");

        return;
    }

    // Special case handling for SVG.
    if ($this->insertMode == static::IM_IN_SVG) {
        $lname = Elements::normalizeSvgElement($lname);
    }

    // Key under which startTag() recorded the namespaces pushed by the current node.
    // See https://github.com/facebook/hhvm/issues/2962
    if (defined('HHVM_VERSION') && ($cid = $this->current->getAttribute('html5-php-fake-id-attribute'))) {
        $this->current->removeAttribute('html5-php-fake-id-attribute');
    } else {
        $cid = spl_object_hash($this->current);
    }

    // XXX: HTML has no parent. What do we do, though,
    // if this element appears in the wrong place?
    if ($lname == 'html') {
        return;
    }

    // remove the namespaces defined by the current node
    if (isset($this->pushes[$cid])) {
        for ($i = 0; $i < $this->pushes[$cid][0]; $i ++) {
            array_shift($this->nsStack);
        }
        unset($this->pushes[$cid]);
    }

    if (! $this->autoclose($lname)) {
        $this->parseError('Could not find closing tag for ' . $lname);
    }

    switch ($lname) {
        case "head":
            $this->insertMode = static::IM_AFTER_HEAD;
            break;
        case "body":
            $this->insertMode = static::IM_AFTER_BODY;
            break;
        case "svg":
        // startTag() enters IM_IN_MATHML on the tag name "math", so that is
        // the name that has to leave it again. "mathml" is kept so existing
        // behaviour for that name does not change.
        case "math":
        case "mathml":
            $this->insertMode = static::IM_IN_BODY;
            break;
    }
}
547
|
|
|
|
548
|
5 |
|
/**
 * Append a comment node to the current node.
 *
 * @param string $cdata The comment text.
 */
public function comment($cdata)
{
    // TODO: Need to handle case where comment appears outside of the HTML tag.
    $this->current->appendChild($this->doc->createComment($cdata));
}
554
|
|
|
|
555
|
74 |
|
/**
 * Append a text node to the current node.
 *
 * Text that arrives before the head has been reached is dropped; if it is
 * anything other than whitespace a parse error is recorded as well.
 *
 * @param string $data The character data.
 */
public function text($data)
{
    // XXX: Hmmm.... should we really be this strict?
    if ($this->insertMode < static::IM_IN_HEAD) {
        // Per '8.2.5.4.3 The "before head" insertion mode' the characters
        // " \t\n\r\f" should be ignored but no mention of a parse error. This is
        // practical as most documents contain these characters. Other text is not
        // expected here so recording a parse error is necessary.
        $dataTmp = trim($data, " \t\n\r\f");
        // Compare against the empty string explicitly: empty() would also
        // treat the text "0" as "nothing there" and drop it without an error.
        if ('' !== $dataTmp) {
            $this->logger->debug(sprintf("Unexpected insert mode: %d", $this->insertMode));
            $this->parseError("Unexpected text. Ignoring: " . $dataTmp);
        }

        return;
    }

    $node = $this->doc->createTextNode($data);
    $this->current->appendChild($node);
}
574
|
|
|
|
575
|
96 |
|
/**
 * Handle the end of input.
 *
 * Nothing is done here at present.
 */
public function eof()
{
    // If the $current isn't the $root, do we need to do anything?
}
579
|
|
|
|
580
|
8 |
|
/**
 * Record a parse error.
 *
 * The formatted message is sent to the logger at error level and appended
 * to the list returned by getErrors().
 *
 * @param string $msg  Description of the problem.
 * @param int    $line Line number to report.
 * @param int    $col  Column number to report.
 */
public function parseError($msg, $line = 0, $col = 0)
{
    $formatted = sprintf("Line %d, Col %d: %s", $line, $col, $msg);

    $this->logger->error($formatted);
    $this->errors[] = $formatted;
}
585
|
|
|
|
586
|
91 |
|
/**
 * Get the parse errors recorded so far.
 *
 * @return array List of formatted error messages.
 */
public function getErrors()
{
    $recorded = $this->errors;

    return $recorded;
}
590
|
|
|
|
591
|
3 |
|
/**
 * Append a CDATA section to the current node.
 *
 * @param string $data The CDATA content.
 */
public function cdata($data)
{
    $this->current->appendChild($this->doc->createCDATASection($data));
}
596
|
|
|
|
597
|
4 |
|
/**
 * Handle a processing instruction.
 *
 * An "xml" PI seen while still in the initial insert mode is skipped. If an
 * instruction processor has been registered it receives the PI and may
 * return a node that becomes the new current node; otherwise a PI node is
 * appended to the current node.
 *
 * @param string      $name The PI target.
 * @param string|null $data The PI content.
 */
public function processingInstruction($name, $data = null)
{
    // XXX: Ignore initial XML declaration, per the spec.
    if ($this->insertMode == static::IM_INITIAL) {
        if ('xml' == strtolower($name)) {
            return;
        }
    }

    if (! isset($this->processor)) {
        // No processor registered: this is just a dumb PI element.
        $instruction = $this->doc->createProcessingInstruction($name, $data);
        $this->current->appendChild($instruction);

        return;
    }

    // Important: The processor may modify the current DOM tree however
    // it sees fit.
    $replacement = $this->processor->process($this->current, $name, $data);
    if (! empty($replacement)) {
        $this->current = $replacement;
    }
}
620
|
|
|
|
621
|
|
|
// ========================================================================== |
622
|
|
|
// UTILITIES |
623
|
|
|
// ========================================================================== |
624
|
|
|
|
625
|
|
|
/**
 * Apply normalization rules to a tag name.
 *
 * See sections 2.9 and 8.1.2. Section 2.9 suggests that a namespace prefix
 * should not be stripped from the name, so the name is currently returned
 * exactly as given.
 *
 * @param string $name
 *            The tag name.
 * @return string The normalized tag name.
 */
protected function normalizeTagName($name)
{
    $normalized = $name;

    return $normalized;
}
641
|
|
|
|
642
|
|
|
/**
 * Placeholder for quirks-mode tree resolution.
 *
 * @param string $name The tag name.
 *
 * @throws \Exception Always; this is not implemented.
 */
protected function quirksTreeResolver($name)
{
    $notImplemented = new \Exception("Not implemented.");

    throw $notImplemented;
}
646
|
|
|
|
647
|
|
|
/**
 * Automatically climb the tree and close the closest node with the matching $tag.
 *
 * Starting at the current node, walks up through the ancestors. When an
 * element named $tag is found, the current-node pointer is moved to that
 * element's parent.
 *
 * @param string $tag The tag name to close.
 *
 * @return bool True if a matching element was found and closed; false if a
 *              non-element node was reached or the top of the tree was passed.
 */
protected function autoclose($tag)
{
    $node = $this->current;

    while (true) {
        // Documents and fragments end the search.
        if ($node->nodeType != XML_ELEMENT_NODE) {
            return false;
        }

        if ($node->tagName == $tag) {
            $this->current = $node->parentNode;

            return true;
        }

        $node = $node->parentNode;
        if (! $node) {
            break;
        }
    }

    return false;
}
665
|
|
|
|
666
|
|
|
/**
 * Checks whether an element with the given tag name is open at or above the current node.
 *
 * The walk starts at $this->current itself and stops at the first node that
 * is not an element.
 *
 * @param string $tagname The tag name to look for.
 *
 * @return bool True if $this->current or one of its element ancestors matches.
 */
protected function isAncestor($tagname)
{
    for ($node = $this->current; $node->nodeType === XML_ELEMENT_NODE; $node = $node->parentNode) {
        if ($node->tagName == $tagname) {
            return true;
        }
    }

    return false;
}
684
|
|
|
|
685
|
|
|
/**
 * Returns true if the current node (the parent of whatever is inserted next)
 * has the given tag name.
 *
 * @param string $tagname The tag name to compare against.
 *
 * @return bool
 */
protected function isParent($tagname)
{
    $currentTag = $this->current->tagName;

    return $currentTag == $tagname;
}
692
|
|
|
} |
If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:
If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.