1 | <?php |
||
13 | class ElementInspector |
||
14 | { |
||
15 | /** |
||
16 | * This is an abridged version of the HTML5 content models and rules, with some liberties taken. |
||
17 | * |
||
18 | * For each element, up to three bitfields are defined: "c", "ac" and "dd". Bitfields are stored |
||
19 | * as raw bytes, formatted using the octal notation to keep the sources ASCII. |
||
20 | * |
||
21 | * "c" represents the categories the element belongs to. The categories are composed of HTML5 |
||
22 | * content models (such as "phrasing content" or "interactive content") plus a few special |
||
23 | * categories created to cover the parts of the specs that refer to "a group of X and Y |
||
24 | * elements" rather than a specific content model. |
||
25 | * |
||
26 | * "ac" represents the categories that are allowed as children of given element. |
||
27 | * |
||
28 | * "dd" represents the categories that must not appear as a descendant of given element. |
||
29 | * |
||
30 | * Sometimes, HTML5 specifies some restrictions on when an element can accept certain children, |
||
31 | * or what categories the element belongs to. For example, an <img> element is only part of the |
||
32 | * "interactive content" category if it has a "usemap" attribute. Those restrictions are |
||
33 | * expressed as an XPath expression and stored using the concatenation of the key of the bitfield |
||
34 | * plus the bit number of the category. For instance, if "interactive content" is assigned to |
||
35 | * bit 2, the definition of the <img> element will contain a key "c2" with value "@usemap". |
||
36 | * |
||
37 | * Additionally, other flags are set: |
||
38 | * |
||
39 | * "t" indicates that the element uses the "transparent" content model. |
||
40 | * "e" indicates that the element uses the "empty" content model. |
||
41 | * "v" indicates that the element is a void element. |
||
42 | * "nt" indicates that the element does not accept text nodes. (no text) |
||
43 | * "to" indicates that the element should only contain text. (text-only) |
||
44 | * "fe" indicates that the element is a formatting element. It will automatically be reopened |
||
45 | * when closed by an end tag of a different name. |
||
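46 | * "b" indicates that the element is not phrasing content, which makes it likely to act like a block element. |
||
47 | * "pre" is set on <pre> and <textarea>; presumably it marks elements whose content preserves whitespace (TODO confirm). |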
||
48 | * |
||
49 | * Finally, HTML5 defines "optional end tag" rules, where one element automatically closes its |
||
50 | * predecessor. Those are used to generate closeParent rules and are stored in the "cp" key. |
||
51 | * |
||
52 | * @var array |
||
53 | * @see /scripts/patchElementInspector.php |
||
54 | */ |
||
55 | protected static $htmlElements = [ |
||
56 | 'a'=>['c'=>"\17\0\0\0\0\1",'c3'=>'@href','ac'=>"\0",'dd'=>"\10\0\0\0\0\1",'t'=>1,'fe'=>1], |
||
57 | 'abbr'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"], |
||
58 | 'address'=>['c'=>"\3\40",'ac'=>"\1",'dd'=>"\0\45",'b'=>1,'cp'=>['p']], |
||
59 | 'article'=>['c'=>"\3\4",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['p']], |
||
60 | 'aside'=>['c'=>"\3\4",'ac'=>"\1",'dd'=>"\0\0\0\0\10",'b'=>1,'cp'=>['p']], |
||
61 | 'audio'=>['c'=>"\57",'c3'=>'@controls','c1'=>'@controls','ac'=>"\0\0\0\104",'ac26'=>'not(@src)','dd'=>"\0\0\0\0\0\2",'dd41'=>'@src','t'=>1], |
||
62 | 'b'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0",'fe'=>1], |
||
63 | 'base'=>['c'=>"\20",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1,'b'=>1], |
||
64 | 'bdi'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"], |
||
65 | 'bdo'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"], |
||
66 | 'blockquote'=>['c'=>"\203",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['p']], |
||
67 | 'body'=>['c'=>"\200\0\4",'ac'=>"\1",'dd'=>"\0",'b'=>1], |
||
68 | 'br'=>['c'=>"\5",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1], |
||
69 | 'button'=>['c'=>"\117",'ac'=>"\4",'dd'=>"\10"], |
||
70 | 'canvas'=>['c'=>"\47",'ac'=>"\0",'dd'=>"\0",'t'=>1], |
||
71 | 'caption'=>['c'=>"\0\2",'ac'=>"\1",'dd'=>"\0\0\0\200",'b'=>1], |
||
72 | 'cite'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"], |
||
73 | 'code'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0",'fe'=>1], |
||
74 | 'col'=>['c'=>"\0\0\20",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1,'b'=>1], |
||
75 | 'colgroup'=>['c'=>"\0\2",'ac'=>"\0\0\20",'ac20'=>'not(@span)','dd'=>"\0",'nt'=>1,'e'=>1,'e?'=>'@span','b'=>1], |
||
76 | 'data'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"], |
||
77 | 'datalist'=>['c'=>"\5",'ac'=>"\4\200\0\10",'dd'=>"\0"], |
||
78 | 'dd'=>['c'=>"\0\0\200",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['dd','dt']], |
||
79 | 'del'=>['c'=>"\5",'ac'=>"\0",'dd'=>"\0",'t'=>1], |
||
80 | 'details'=>['c'=>"\213",'ac'=>"\1\0\0\2",'dd'=>"\0",'b'=>1,'cp'=>['p']], |
||
81 | 'dfn'=>['c'=>"\7\0\0\0\40",'ac'=>"\4",'dd'=>"\0\0\0\0\40"], |
||
82 | 'div'=>['c'=>"\3",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['p']], |
||
83 | 'dl'=>['c'=>"\3",'c1'=>'dt and dd','ac'=>"\0\200\200",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['p']], |
||
84 | 'dt'=>['c'=>"\0\0\200",'ac'=>"\1",'dd'=>"\0\5\0\40",'b'=>1,'cp'=>['dd','dt']], |
||
85 | 'em'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0",'fe'=>1], |
||
86 | 'embed'=>['c'=>"\57",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1], |
||
87 | 'fieldset'=>['c'=>"\303",'ac'=>"\1\0\0\20",'dd'=>"\0",'b'=>1,'cp'=>['p']], |
||
88 | 'figcaption'=>['c'=>"\0\0\0\0\0\4",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['p']], |
||
89 | 'figure'=>['c'=>"\203",'ac'=>"\1\0\0\0\0\4",'dd'=>"\0",'b'=>1,'cp'=>['p']], |
||
90 | 'footer'=>['c'=>"\3\40",'ac'=>"\1",'dd'=>"\0\0\0\0\10",'b'=>1,'cp'=>['p']], |
||
91 | 'form'=>['c'=>"\3\0\0\0\20",'ac'=>"\1",'dd'=>"\0\0\0\0\20",'b'=>1,'cp'=>['p']], |
||
92 | 'h1'=>['c'=>"\3\1",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['p']], |
||
93 | 'h2'=>['c'=>"\3\1",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['p']], |
||
94 | 'h3'=>['c'=>"\3\1",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['p']], |
||
95 | 'h4'=>['c'=>"\3\1",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['p']], |
||
96 | 'h5'=>['c'=>"\3\1",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['p']], |
||
97 | 'h6'=>['c'=>"\3\1",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['p']], |
||
98 | 'head'=>['c'=>"\0\0\4",'ac'=>"\20",'dd'=>"\0",'nt'=>1,'b'=>1], |
||
99 | 'header'=>['c'=>"\3\40\0\40",'ac'=>"\1",'dd'=>"\0\0\0\0\10",'b'=>1,'cp'=>['p']], |
||
100 | 'hr'=>['c'=>"\1\100",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1,'b'=>1,'cp'=>['p']], |
||
101 | 'html'=>['c'=>"\0",'ac'=>"\0\0\4",'dd'=>"\0",'nt'=>1,'b'=>1], |
||
102 | 'i'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0",'fe'=>1], |
||
103 | 'iframe'=>['c'=>"\57",'ac'=>"\4",'dd'=>"\0"], |
||
104 | 'img'=>['c'=>"\57\20\10",'c3'=>'@usemap','ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1], |
||
105 | 'input'=>['c'=>"\17\20",'c3'=>'@type!="hidden"','c12'=>'@type!="hidden" or @type="hidden"','c1'=>'@type!="hidden"','ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1], |
||
106 | 'ins'=>['c'=>"\7",'ac'=>"\0",'dd'=>"\0",'t'=>1], |
||
107 | 'kbd'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"], |
||
108 | 'keygen'=>['c'=>"\117",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1], |
||
109 | 'label'=>['c'=>"\17\20\0\0\4",'ac'=>"\4",'dd'=>"\0\0\1\0\4"], |
||
110 | 'legend'=>['c'=>"\0\0\0\20",'ac'=>"\4",'dd'=>"\0",'b'=>1], |
||
111 | 'li'=>['c'=>"\0\0\0\0\200",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['li']], |
||
112 | 'link'=>['c'=>"\20",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1,'b'=>1], |
||
113 | 'main'=>['c'=>"\3\0\0\0\10",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['p']], |
||
114 | 'mark'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"], |
||
115 | 'media element'=>['c'=>"\0\0\0\0\0\2",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'b'=>1], |
||
116 | 'menu'=>['c'=>"\1\100",'ac'=>"\0\300",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['p']], |
||
117 | 'menuitem'=>['c'=>"\0\100",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1,'b'=>1], |
||
118 | 'meta'=>['c'=>"\20",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1,'b'=>1], |
||
119 | 'meter'=>['c'=>"\7\0\1\0\2",'ac'=>"\4",'dd'=>"\0\0\0\0\2"], |
||
120 | 'nav'=>['c'=>"\3\4",'ac'=>"\1",'dd'=>"\0\0\0\0\10",'b'=>1,'cp'=>['p']], |
||
121 | 'noscript'=>['c'=>"\25",'ac'=>"\0",'dd'=>"\0",'nt'=>1], |
||
122 | 'object'=>['c'=>"\147",'ac'=>"\0\0\0\0\1",'dd'=>"\0",'t'=>1], |
||
123 | 'ol'=>['c'=>"\3",'c1'=>'li','ac'=>"\0\200\0\0\200",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['p']], |
||
124 | 'optgroup'=>['c'=>"\0\0\2",'ac'=>"\0\200\0\10",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['optgroup','option']], |
||
125 | 'option'=>['c'=>"\0\0\2\10",'ac'=>"\0",'dd'=>"\0",'b'=>1,'cp'=>['option']], |
||
126 | 'output'=>['c'=>"\107",'ac'=>"\4",'dd'=>"\0"], |
||
127 | 'p'=>['c'=>"\3",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['p']], |
||
128 | 'param'=>['c'=>"\0\0\0\0\1",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1,'b'=>1], |
||
129 | 'picture'=>['c'=>"\45",'ac'=>"\0\200\10",'dd'=>"\0",'nt'=>1], |
||
130 | 'pre'=>['c'=>"\3",'ac'=>"\4",'dd'=>"\0",'pre'=>1,'b'=>1,'cp'=>['p']], |
||
131 | 'progress'=>['c'=>"\7\0\1\1",'ac'=>"\4",'dd'=>"\0\0\0\1"], |
||
132 | 'q'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"], |
||
133 | 'rb'=>['c'=>"\0\10",'ac'=>"\4",'dd'=>"\0",'b'=>1], |
||
134 | 'rp'=>['c'=>"\0\10\100",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['rp','rt']], |
||
135 | 'rt'=>['c'=>"\0\10\100",'ac'=>"\4",'dd'=>"\0",'b'=>1,'cp'=>['rp','rt']], |
||
136 | 'rtc'=>['c'=>"\0\10",'ac'=>"\4\0\100",'dd'=>"\0",'b'=>1], |
||
137 | 'ruby'=>['c'=>"\7",'ac'=>"\4\10",'dd'=>"\0"], |
||
138 | 's'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0",'fe'=>1], |
||
139 | 'samp'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"], |
||
140 | 'script'=>['c'=>"\25\200",'ac'=>"\0",'dd'=>"\0",'to'=>1], |
||
141 | 'section'=>['c'=>"\3\4",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['p']], |
||
142 | 'select'=>['c'=>"\117",'ac'=>"\0\200\2",'dd'=>"\0",'nt'=>1], |
||
143 | 'small'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0",'fe'=>1], |
||
144 | 'source'=>['c'=>"\0\0\10\4",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1,'b'=>1], |
||
145 | 'span'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"], |
||
146 | 'strong'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0",'fe'=>1], |
||
147 | 'style'=>['c'=>"\20",'ac'=>"\0",'dd'=>"\0",'to'=>1,'b'=>1], |
||
148 | 'sub'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"], |
||
149 | 'summary'=>['c'=>"\0\0\0\2",'ac'=>"\4\1",'dd'=>"\0",'b'=>1], |
||
150 | 'sup'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"], |
||
151 | 'table'=>['c'=>"\3\0\0\200",'ac'=>"\0\202",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['p']], |
||
152 | 'tbody'=>['c'=>"\0\2",'ac'=>"\0\200\0\0\100",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['tbody','td','tfoot','th','thead','tr']], |
||
153 | 'td'=>['c'=>"\200\0\40",'ac'=>"\1",'dd'=>"\0",'b'=>1,'cp'=>['td','th']], |
||
154 | 'template'=>['c'=>"\25\200\20",'ac'=>"\0",'dd'=>"\0",'nt'=>1], |
||
155 | 'textarea'=>['c'=>"\117",'ac'=>"\0",'dd'=>"\0",'pre'=>1,'to'=>1], |
||
156 | 'tfoot'=>['c'=>"\0\2",'ac'=>"\0\200\0\0\100",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['tbody','td','th','thead','tr']], |
||
157 | 'th'=>['c'=>"\0\0\40",'ac'=>"\1",'dd'=>"\0\5\0\40",'b'=>1,'cp'=>['td','th']], |
||
158 | 'thead'=>['c'=>"\0\2",'ac'=>"\0\200\0\0\100",'dd'=>"\0",'nt'=>1,'b'=>1], |
||
159 | 'time'=>['c'=>"\7",'ac'=>"\4",'ac2'=>'@datetime','dd'=>"\0"], |
||
160 | 'title'=>['c'=>"\20",'ac'=>"\0",'dd'=>"\0",'to'=>1,'b'=>1], |
||
161 | 'tr'=>['c'=>"\0\2\0\0\100",'ac'=>"\0\200\40",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['td','th','tr']], |
||
162 | 'track'=>['c'=>"\0\0\0\100",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1,'b'=>1], |
||
163 | 'u'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0",'fe'=>1], |
||
164 | 'ul'=>['c'=>"\3",'c1'=>'li','ac'=>"\0\200\0\0\200",'dd'=>"\0",'nt'=>1,'b'=>1,'cp'=>['p']], |
||
165 | 'var'=>['c'=>"\7",'ac'=>"\4",'dd'=>"\0"], |
||
166 | 'video'=>['c'=>"\57",'c3'=>'@controls','ac'=>"\0\0\0\104",'ac26'=>'not(@src)','dd'=>"\0\0\0\0\0\2",'dd41'=>'@src','t'=>1], |
||
167 | 'wbr'=>['c'=>"\5",'ac'=>"\0",'dd'=>"\0",'nt'=>1,'e'=>1,'v'=>1] |
||
168 | ]; |
||
169 | |||
170 | /** |
||
171 | * Test whether given child element closes given parent element |
||
172 | * |
||
173 | * @param DOMElement $child |
||
174 | * @param DOMElement $parent |
||
175 | * @return bool |
||
176 | */ |
||
177 | public static function closesParent(DOMElement $child, DOMElement $parent) |
||
184 | |||
185 | /** |
||
186 | * Test whether given element disallows text nodes |
||
187 | * |
||
188 | * @param DOMElement $element |
||
189 | * @return bool |
||
190 | */ |
||
191 | public static function disallowsText(DOMElement $element) |
||
195 | |||
196 | /** |
||
197 | * Return the "allowChild" bitfield for given element |
||
198 | * |
||
199 | * @param DOMElement $element |
||
200 | * @return string |
||
201 | */ |
||
202 | public static function getAllowChildBitfield(DOMElement $element) |
||
206 | |||
207 | /** |
||
208 | * Return the "category" bitfield for given element |
||
209 | * |
||
210 | * @param DOMElement $element |
||
211 | * @return string |
||
212 | */ |
||
213 | public static function getCategoryBitfield(DOMElement $element) |
||
217 | |||
218 | /** |
||
219 | * Return the "denyDescendant" bitfield for given element |
||
220 | * |
||
221 | * @param DOMElement $element |
||
222 | * @return string |
||
223 | */ |
||
224 | public static function getDenyDescendantBitfield(DOMElement $element) |
||
228 | |||
229 | /** |
||
230 | * Test whether given element is a block element |
||
231 | * |
||
232 | * @param DOMElement $element |
||
233 | * @return bool |
||
234 | */ |
||
235 | public static function isBlock(DOMElement $element) |
||
239 | |||
240 | /** |
||
241 | * Test whether given element uses the empty content model |
||
242 | * |
||
243 | * @param DOMElement $element |
||
244 | * @return bool |
||
245 | */ |
||
246 | public static function isEmpty(DOMElement $element) |
||
250 | |||
251 | /** |
||
252 | * Test whether given element is a formatting element |
||
253 | * |
||
254 | * @param DOMElement $element |
||
255 | * @return bool |
||
256 | */ |
||
257 | public static function isFormattingElement(DOMElement $element) |
||
261 | |||
262 | /** |
||
263 | * Test whether given element only accepts text nodes |
||
264 | * |
||
265 | * @param DOMElement $element |
||
266 | * @return bool |
||
267 | */ |
||
268 | public static function isTextOnly(DOMElement $element) |
||
272 | |||
273 | /** |
||
274 | * Test whether given element uses the transparent content model |
||
275 | * |
||
276 | * @param DOMElement $element |
||
277 | * @return bool |
||
278 | */ |
||
279 | public static function isTransparent(DOMElement $element) |
||
283 | |||
284 | /** |
||
285 | * Test whether given element is a void element |
||
286 | * |
||
287 | * @param DOMElement $element |
||
288 | * @return bool |
||
289 | */ |
||
290 | public static function isVoid(DOMElement $element) |
||
294 | |||
295 | /** |
||
296 | * Test whether given element preserves whitespace in its content |
||
297 | * |
||
298 | * @param DOMElement $element |
||
299 | * @return bool |
||
300 | */ |
||
301 | public static function preservesWhitespace(DOMElement $element) |
||
305 | |||
306 | /** |
||
307 | * Evaluate an XPath query using given element as context node |
||
308 | * |
||
309 | * @param string $query XPath query |
||
310 | * @param DOMElement $element Context node |
||
311 | * @return bool |
||
312 | */ |
||
313 | protected static function evaluate($query, DOMElement $element) |
||
319 | |||
320 | /** |
||
321 | * Get the bitfield value for a given element |
||
322 | * |
||
323 | * @param DOMElement $element Context node |
||
324 | * @param string $name Bitfield name: either 'c', 'ac' or 'dd' |
||
325 | * @return string |
||
326 | */ |
||
327 | protected static function getBitfield(DOMElement $element, $name) |
||
345 | |||
346 | /** |
||
347 | * Return the properties associated with given element |
||
348 | * |
||
349 | * Returns span's properties if the element is not defined |
||
350 | * |
||
351 | * @param DOMElement $element |
||
352 | * @return array |
||
353 | */ |
||
354 | protected static function getProperties(DOMElement $element) |
||
358 | |||
359 | /** |
||
360 | * Test whether given element has given property in context |
||
361 | * |
||
362 | * @param DOMElement $element Context node |
||
363 | * @param string $propName Property name, see self::$htmlElements |
||
364 | * @return bool |
||
365 | */ |
||
366 | protected static function hasProperty(DOMElement $element, $propName) |
||
372 | |||
373 | /** |
||
374 | * Convert a raw string to a series of 0 and 1 in LSB order |
||
375 | * |
||
376 | * @param string $raw |
||
377 | * @return string |
||
378 | */ |
||
379 | protected static function toBin($raw) |
||
389 | |||
390 | /** |
||
391 | * Convert a series of 0 and 1 in LSB order to a raw string |
||
392 | * |
||
393 | * @param string $bin |
||
394 | * @return string |
||
395 | */ |
||
396 | protected static function toRaw($bin) |
||
400 | } |