1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* HTML sanitizer for %MediaWiki. |
4
|
|
|
* |
5
|
|
|
* Copyright © 2002-2005 Brion Vibber <[email protected]> et al |
6
|
|
|
* https://www.mediawiki.org/ |
7
|
|
|
* |
8
|
|
|
* This program is free software; you can redistribute it and/or modify |
9
|
|
|
* it under the terms of the GNU General Public License as published by |
10
|
|
|
* the Free Software Foundation; either version 2 of the License, or |
11
|
|
|
* (at your option) any later version. |
12
|
|
|
* |
13
|
|
|
* This program is distributed in the hope that it will be useful, |
14
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
15
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16
|
|
|
* GNU General Public License for more details. |
17
|
|
|
* |
18
|
|
|
* You should have received a copy of the GNU General Public License along |
19
|
|
|
* with this program; if not, write to the Free Software Foundation, Inc., |
20
|
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. |
21
|
|
|
* http://www.gnu.org/copyleft/gpl.html |
22
|
|
|
* |
23
|
|
|
* @file |
24
|
|
|
* @ingroup Parser |
25
|
|
|
*/ |
26
|
|
|
|
27
|
|
|
/** |
28
|
|
|
* HTML sanitizer for MediaWiki |
29
|
|
|
* @ingroup Parser |
30
|
|
|
*/ |
31
|
|
|
class Sanitizer { |
32
|
|
|
/** |
33
|
|
|
* Regular expression to match various types of character references in |
34
|
|
|
* Sanitizer::normalizeCharReferences and Sanitizer::decodeCharReferences |
35
|
|
|
*/ |
36
|
|
|
const CHAR_REFS_REGEX = |
37
|
|
|
'/&([A-Za-z0-9\x80-\xff]+); |
38
|
|
|
|&\#([0-9]+); |
39
|
|
|
|&\#[xX]([0-9A-Fa-f]+); |
40
|
|
|
|(&)/x'; |
41
|
|
|
|
42
|
|
|
/** |
43
|
|
|
* Acceptable tag name charset from HTML5 parsing spec |
44
|
|
|
* https://www.w3.org/TR/html5/syntax.html#tag-open-state |
45
|
|
|
*/ |
46
|
|
|
const ELEMENT_BITS_REGEX = '!^(/?)([A-Za-z][^\t\n\v />\0]*+)([^>]*?)(/?>)([^<]*)$!'; |
47
|
|
|
|
48
|
|
|
/** |
49
|
|
|
* Blacklist for evil uris like javascript: |
50
|
|
|
* WARNING: DO NOT use this in any place that actually requires blacklisting |
51
|
|
|
* for security reasons. There are NUMEROUS[1] ways to bypass blacklisting, the |
52
|
|
|
* only way to be secure from javascript: uri based xss vectors is to whitelist |
53
|
|
|
* things that you know are safe and deny everything else. |
54
|
|
|
* [1]: http://ha.ckers.org/xss.html |
55
|
|
|
*/ |
56
|
|
|
const EVIL_URI_PATTERN = '!(^|\s|\*/\s*)(javascript|vbscript)([^\w]|$)!i'; |
57
|
|
|
const XMLNS_ATTRIBUTE_PATTERN = "/^xmlns:[:A-Z_a-z-.0-9]+$/"; |
58
|
|
|
|
59
|
|
|
/** |
60
|
|
|
* List of all named character entities defined in HTML 4.01 |
61
|
|
|
* https://www.w3.org/TR/html4/sgml/entities.html |
62
|
|
|
* As well as &apos; which is only defined starting in XHTML1.
63
|
|
|
*/ |
64
|
|
|
private static $htmlEntities = [ |
65
|
|
|
'Aacute' => 193, |
66
|
|
|
'aacute' => 225, |
67
|
|
|
'Acirc' => 194, |
68
|
|
|
'acirc' => 226, |
69
|
|
|
'acute' => 180, |
70
|
|
|
'AElig' => 198, |
71
|
|
|
'aelig' => 230, |
72
|
|
|
'Agrave' => 192, |
73
|
|
|
'agrave' => 224, |
74
|
|
|
'alefsym' => 8501, |
75
|
|
|
'Alpha' => 913, |
76
|
|
|
'alpha' => 945, |
77
|
|
|
'amp' => 38, |
78
|
|
|
'and' => 8743, |
79
|
|
|
'ang' => 8736, |
80
|
|
|
'apos' => 39, // New in XHTML & HTML 5; avoid in output for compatibility with IE. |
81
|
|
|
'Aring' => 197, |
82
|
|
|
'aring' => 229, |
83
|
|
|
'asymp' => 8776, |
84
|
|
|
'Atilde' => 195, |
85
|
|
|
'atilde' => 227, |
86
|
|
|
'Auml' => 196, |
87
|
|
|
'auml' => 228, |
88
|
|
|
'bdquo' => 8222, |
89
|
|
|
'Beta' => 914, |
90
|
|
|
'beta' => 946, |
91
|
|
|
'brvbar' => 166, |
92
|
|
|
'bull' => 8226, |
93
|
|
|
'cap' => 8745, |
94
|
|
|
'Ccedil' => 199, |
95
|
|
|
'ccedil' => 231, |
96
|
|
|
'cedil' => 184, |
97
|
|
|
'cent' => 162, |
98
|
|
|
'Chi' => 935, |
99
|
|
|
'chi' => 967, |
100
|
|
|
'circ' => 710, |
101
|
|
|
'clubs' => 9827, |
102
|
|
|
'cong' => 8773, |
103
|
|
|
'copy' => 169, |
104
|
|
|
'crarr' => 8629, |
105
|
|
|
'cup' => 8746, |
106
|
|
|
'curren' => 164, |
107
|
|
|
'dagger' => 8224, |
108
|
|
|
'Dagger' => 8225, |
109
|
|
|
'darr' => 8595, |
110
|
|
|
'dArr' => 8659, |
111
|
|
|
'deg' => 176, |
112
|
|
|
'Delta' => 916, |
113
|
|
|
'delta' => 948, |
114
|
|
|
'diams' => 9830, |
115
|
|
|
'divide' => 247, |
116
|
|
|
'Eacute' => 201, |
117
|
|
|
'eacute' => 233, |
118
|
|
|
'Ecirc' => 202, |
119
|
|
|
'ecirc' => 234, |
120
|
|
|
'Egrave' => 200, |
121
|
|
|
'egrave' => 232, |
122
|
|
|
'empty' => 8709, |
123
|
|
|
'emsp' => 8195, |
124
|
|
|
'ensp' => 8194, |
125
|
|
|
'Epsilon' => 917, |
126
|
|
|
'epsilon' => 949, |
127
|
|
|
'equiv' => 8801, |
128
|
|
|
'Eta' => 919, |
129
|
|
|
'eta' => 951, |
130
|
|
|
'ETH' => 208, |
131
|
|
|
'eth' => 240, |
132
|
|
|
'Euml' => 203, |
133
|
|
|
'euml' => 235, |
134
|
|
|
'euro' => 8364, |
135
|
|
|
'exist' => 8707, |
136
|
|
|
'fnof' => 402, |
137
|
|
|
'forall' => 8704, |
138
|
|
|
'frac12' => 189, |
139
|
|
|
'frac14' => 188, |
140
|
|
|
'frac34' => 190, |
141
|
|
|
'frasl' => 8260, |
142
|
|
|
'Gamma' => 915, |
143
|
|
|
'gamma' => 947, |
144
|
|
|
'ge' => 8805, |
145
|
|
|
'gt' => 62, |
146
|
|
|
'harr' => 8596, |
147
|
|
|
'hArr' => 8660, |
148
|
|
|
'hearts' => 9829, |
149
|
|
|
'hellip' => 8230, |
150
|
|
|
'Iacute' => 205, |
151
|
|
|
'iacute' => 237, |
152
|
|
|
'Icirc' => 206, |
153
|
|
|
'icirc' => 238, |
154
|
|
|
'iexcl' => 161, |
155
|
|
|
'Igrave' => 204, |
156
|
|
|
'igrave' => 236, |
157
|
|
|
'image' => 8465, |
158
|
|
|
'infin' => 8734, |
159
|
|
|
'int' => 8747, |
160
|
|
|
'Iota' => 921, |
161
|
|
|
'iota' => 953, |
162
|
|
|
'iquest' => 191, |
163
|
|
|
'isin' => 8712, |
164
|
|
|
'Iuml' => 207, |
165
|
|
|
'iuml' => 239, |
166
|
|
|
'Kappa' => 922, |
167
|
|
|
'kappa' => 954, |
168
|
|
|
'Lambda' => 923, |
169
|
|
|
'lambda' => 955, |
170
|
|
|
'lang' => 9001, |
171
|
|
|
'laquo' => 171, |
172
|
|
|
'larr' => 8592, |
173
|
|
|
'lArr' => 8656, |
174
|
|
|
'lceil' => 8968, |
175
|
|
|
'ldquo' => 8220, |
176
|
|
|
'le' => 8804, |
177
|
|
|
'lfloor' => 8970, |
178
|
|
|
'lowast' => 8727, |
179
|
|
|
'loz' => 9674, |
180
|
|
|
'lrm' => 8206, |
181
|
|
|
'lsaquo' => 8249, |
182
|
|
|
'lsquo' => 8216, |
183
|
|
|
'lt' => 60, |
184
|
|
|
'macr' => 175, |
185
|
|
|
'mdash' => 8212, |
186
|
|
|
'micro' => 181, |
187
|
|
|
'middot' => 183, |
188
|
|
|
'minus' => 8722, |
189
|
|
|
'Mu' => 924, |
190
|
|
|
'mu' => 956, |
191
|
|
|
'nabla' => 8711, |
192
|
|
|
'nbsp' => 160, |
193
|
|
|
'ndash' => 8211, |
194
|
|
|
'ne' => 8800, |
195
|
|
|
'ni' => 8715, |
196
|
|
|
'not' => 172, |
197
|
|
|
'notin' => 8713, |
198
|
|
|
'nsub' => 8836, |
199
|
|
|
'Ntilde' => 209, |
200
|
|
|
'ntilde' => 241, |
201
|
|
|
'Nu' => 925, |
202
|
|
|
'nu' => 957, |
203
|
|
|
'Oacute' => 211, |
204
|
|
|
'oacute' => 243, |
205
|
|
|
'Ocirc' => 212, |
206
|
|
|
'ocirc' => 244, |
207
|
|
|
'OElig' => 338, |
208
|
|
|
'oelig' => 339, |
209
|
|
|
'Ograve' => 210, |
210
|
|
|
'ograve' => 242, |
211
|
|
|
'oline' => 8254, |
212
|
|
|
'Omega' => 937, |
213
|
|
|
'omega' => 969, |
214
|
|
|
'Omicron' => 927, |
215
|
|
|
'omicron' => 959, |
216
|
|
|
'oplus' => 8853, |
217
|
|
|
'or' => 8744, |
218
|
|
|
'ordf' => 170, |
219
|
|
|
'ordm' => 186, |
220
|
|
|
'Oslash' => 216, |
221
|
|
|
'oslash' => 248, |
222
|
|
|
'Otilde' => 213, |
223
|
|
|
'otilde' => 245, |
224
|
|
|
'otimes' => 8855, |
225
|
|
|
'Ouml' => 214, |
226
|
|
|
'ouml' => 246, |
227
|
|
|
'para' => 182, |
228
|
|
|
'part' => 8706, |
229
|
|
|
'permil' => 8240, |
230
|
|
|
'perp' => 8869, |
231
|
|
|
'Phi' => 934, |
232
|
|
|
'phi' => 966, |
233
|
|
|
'Pi' => 928, |
234
|
|
|
'pi' => 960, |
235
|
|
|
'piv' => 982, |
236
|
|
|
'plusmn' => 177, |
237
|
|
|
'pound' => 163, |
238
|
|
|
'prime' => 8242, |
239
|
|
|
'Prime' => 8243, |
240
|
|
|
'prod' => 8719, |
241
|
|
|
'prop' => 8733, |
242
|
|
|
'Psi' => 936, |
243
|
|
|
'psi' => 968, |
244
|
|
|
'quot' => 34, |
245
|
|
|
'radic' => 8730, |
246
|
|
|
'rang' => 9002, |
247
|
|
|
'raquo' => 187, |
248
|
|
|
'rarr' => 8594, |
249
|
|
|
'rArr' => 8658, |
250
|
|
|
'rceil' => 8969, |
251
|
|
|
'rdquo' => 8221, |
252
|
|
|
'real' => 8476, |
253
|
|
|
'reg' => 174, |
254
|
|
|
'rfloor' => 8971, |
255
|
|
|
'Rho' => 929, |
256
|
|
|
'rho' => 961, |
257
|
|
|
'rlm' => 8207, |
258
|
|
|
'rsaquo' => 8250, |
259
|
|
|
'rsquo' => 8217, |
260
|
|
|
'sbquo' => 8218, |
261
|
|
|
'Scaron' => 352, |
262
|
|
|
'scaron' => 353, |
263
|
|
|
'sdot' => 8901, |
264
|
|
|
'sect' => 167, |
265
|
|
|
'shy' => 173, |
266
|
|
|
'Sigma' => 931, |
267
|
|
|
'sigma' => 963, |
268
|
|
|
'sigmaf' => 962, |
269
|
|
|
'sim' => 8764, |
270
|
|
|
'spades' => 9824, |
271
|
|
|
'sub' => 8834, |
272
|
|
|
'sube' => 8838, |
273
|
|
|
'sum' => 8721, |
274
|
|
|
'sup' => 8835, |
275
|
|
|
'sup1' => 185, |
276
|
|
|
'sup2' => 178, |
277
|
|
|
'sup3' => 179, |
278
|
|
|
'supe' => 8839, |
279
|
|
|
'szlig' => 223, |
280
|
|
|
'Tau' => 932, |
281
|
|
|
'tau' => 964, |
282
|
|
|
'there4' => 8756, |
283
|
|
|
'Theta' => 920, |
284
|
|
|
'theta' => 952, |
285
|
|
|
'thetasym' => 977, |
286
|
|
|
'thinsp' => 8201, |
287
|
|
|
'THORN' => 222, |
288
|
|
|
'thorn' => 254, |
289
|
|
|
'tilde' => 732, |
290
|
|
|
'times' => 215, |
291
|
|
|
'trade' => 8482, |
292
|
|
|
'Uacute' => 218, |
293
|
|
|
'uacute' => 250, |
294
|
|
|
'uarr' => 8593, |
295
|
|
|
'uArr' => 8657, |
296
|
|
|
'Ucirc' => 219, |
297
|
|
|
'ucirc' => 251, |
298
|
|
|
'Ugrave' => 217, |
299
|
|
|
'ugrave' => 249, |
300
|
|
|
'uml' => 168, |
301
|
|
|
'upsih' => 978, |
302
|
|
|
'Upsilon' => 933, |
303
|
|
|
'upsilon' => 965, |
304
|
|
|
'Uuml' => 220, |
305
|
|
|
'uuml' => 252, |
306
|
|
|
'weierp' => 8472, |
307
|
|
|
'Xi' => 926, |
308
|
|
|
'xi' => 958, |
309
|
|
|
'Yacute' => 221, |
310
|
|
|
'yacute' => 253, |
311
|
|
|
'yen' => 165, |
312
|
|
|
'Yuml' => 376, |
313
|
|
|
'yuml' => 255, |
314
|
|
|
'Zeta' => 918, |
315
|
|
|
'zeta' => 950, |
316
|
|
|
'zwj' => 8205, |
317
|
|
|
'zwnj' => 8204 |
318
|
|
|
]; |
319
|
|
|
|
320
|
|
|
/** |
321
|
|
|
* Character entity aliases accepted by MediaWiki |
322
|
|
|
*/ |
323
|
|
|
private static $htmlEntityAliases = [ |
324
|
|
|
'רלמ' => 'rlm', |
325
|
|
|
'رلم' => 'rlm', |
326
|
|
|
]; |
327
|
|
|
|
328
|
|
|
/** |
329
|
|
|
* Lazy-initialised attributes regex, see getAttribsRegex() |
330
|
|
|
*/ |
331
|
|
|
private static $attribsRegex; |
332
|
|
|
|
333
|
|
|
/**
 * Build (once) and return the regular expression that matches a single
 * HTML/XML attribute, with or without a value, inside a tag.
 *
 * The pattern is deliberately lenient. It is based on
 * https://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
 * Used in Sanitizer::fixTagAttributes and Sanitizer::decodeTagAttributes
 *
 * Capture groups: 1 is the attribute name, 2 the whole "= value" part
 * (when present), 3 a double-quoted value, 4 a single-quoted value and
 * 5 an unquoted value.
 *
 * The pattern string is memoised in self::$attribsRegex.
 *
 * @return string
 */
public static function getAttribsRegex() {
	// Already built on an earlier call: reuse it.
	if ( self::$attribsRegex !== null ) {
		return self::$attribsRegex;
	}

	$nameStart = '[:A-Z_a-z0-9]';
	$nameChar = '[:A-Z_a-z-.0-9]';
	$ws = '[\x09\x0a\x0c\x0d\x20]';

	// The /x modifier makes the layout whitespace and the "#" comment
	// inside the pattern insignificant.
	self::$attribsRegex =
		"/(?:^|{$ws})({$nameStart}{$nameChar}*)
		  ({$ws}*={$ws}*
			(?:
			 # The attribute value: quoted or alone
			  \"([^\"]*)(?:\"|\$)
			| '([^']*)(?:'|\$)
			| (((?!{$ws}|>).)*)
			)
		)?(?={$ws}|\$)/sx";

	return self::$attribsRegex;
}
358
|
|
|
|
359
|
|
|
/**
 * Return the various lists of recognized tags.
 *
 * Every list is returned as a hashtable keyed by tag name (built with
 * array_flip), so membership is tested with isset(). The returned keys
 * are: htmlpairs, htmlsingle, htmlsingleonly, htmlnest, tabletags,
 * htmllist, listtags, htmlsingleallowed and htmlelements.
 *
 * The base tables are cached in static variables and rebuilt only when
 * the truthiness of $wgAllowImageTag changes between calls.
 *
 * @param array $extratags For any extra tags to include
 * @param array $removetags For any tags (default or extra) to exclude
 * @return array
 */
public static function getRecognizedTagData( $extratags = [], $removetags = [] ) {
	global $wgAllowImageTag;

	static $htmlpairsStatic, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelementsStatic;
	// Value of the config flag the cached tables were built for; null
	// means "not built yet". Kept separate from the flag itself so that a
	// falsy $wgAllowImageTag does not force a rebuild on every call.
	static $cachedContext = null;

	// Base the cache key on the global config state so that if the globals
	// are changed (like in the test system) we will re-initialise the settings.
	$globalContext = (bool)$wgAllowImageTag;
	if ( $cachedContext !== $globalContext ) {
		$htmlpairsStatic = [ # Tags that must be closed
			'b', 'bdi', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rb', 'rp', 'rt', 'rtc', 'p', 'span', 'abbr', 'dfn',
			'kbd', 'samp', 'data', 'time', 'mark'
		];
		$htmlsingle = [
			'br', 'wbr', 'hr', 'li', 'dt', 'dd', 'meta', 'link'
		];

		# Elements that cannot have close tags. This is (not coincidentally)
		# also the list of tags for which the HTML 5 parsing algorithm
		# requires you to "acknowledge the token's self-closing flag", i.e.
		# a self-closing tag like <br/> is not an HTML 5 parse error only
		# for this list.
		$htmlsingleonly = [
			'br', 'wbr', 'hr', 'meta', 'link'
		];

		$htmlnest = [ # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'li', 'dl', 'dt', 'dd', 'font', 'big', 'small', 'sub', 'sup', 'span',
			'var', 'kbd', 'samp', 'em', 'strong', 'q', 'ruby', 'bdo'
		];
		$tabletags = [ # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		];
		$htmllist = [ # Tags used by list
			'ul', 'ol',
		];
		$listtags = [ # Tags that can appear in a list
			'li',
		];

		if ( $wgAllowImageTag ) {
			$htmlsingle[] = 'img';
			$htmlsingleonly[] = 'img';
		}

		// The combined lists must be computed from the plain lists, before
		// anything is flipped into a hashtable.
		$htmlsingleallowed = array_unique( array_merge( $htmlsingle, $tabletags ) );
		$htmlelementsStatic = array_unique( array_merge( $htmlsingle, $htmlpairsStatic, $htmlnest ) );

		# Convert them all to hashtables for faster lookup
		$htmlpairsStatic = array_flip( $htmlpairsStatic );
		$htmlsingle = array_flip( $htmlsingle );
		$htmlsingleonly = array_flip( $htmlsingleonly );
		$htmlnest = array_flip( $htmlnest );
		$tabletags = array_flip( $tabletags );
		$htmllist = array_flip( $htmllist );
		$listtags = array_flip( $listtags );
		$htmlsingleallowed = array_flip( $htmlsingleallowed );
		$htmlelementsStatic = array_flip( $htmlelementsStatic );

		$cachedContext = $globalContext;
	}

	# Populate $htmlpairs and $htmlelements with the $extratags and $removetags arrays
	$extratags = array_flip( $extratags );
	$removetags = array_flip( $removetags );
	// Extra tags count as paired tags; removed tags are only taken out of
	// the overall element list.
	$htmlpairs = array_merge( $extratags, $htmlpairsStatic );
	$htmlelements = array_diff_key( array_merge( $extratags, $htmlelementsStatic ), $removetags );

	return [
		'htmlpairs' => $htmlpairs,
		'htmlsingle' => $htmlsingle,
		'htmlsingleonly' => $htmlsingleonly,
		'htmlnest' => $htmlnest,
		'tabletags' => $tabletags,
		'htmllist' => $htmllist,
		'listtags' => $listtags,
		'htmlsingleallowed' => $htmlsingleallowed,
		'htmlelements' => $htmlelements,
	];
}
446
|
|
|
|
447
|
|
|
/**
 * Cleans up HTML, removes dangerous tags and attributes, and
 * removes HTML comments.
 *
 * The text is split on '<'. Each piece that parses as a recognized tag
 * (see getRecognizedTagData()) is re-emitted with its attributes passed
 * through fixTagAttributes(); every other piece is emitted with a
 * literal '<' in front of it. Any '>' that is not the end of an accepted
 * tag is escaped as '&gt;'.
 *
 * When MWTidy is not enabled, a tag stack is kept so that mismatched
 * close tags are rejected and tags still open at the end are closed.
 *
 * @param string $text
 * @param callable $processCallback Callback to do any variable or parameter
 *   replacements in HTML attribute values
 * @param array|bool $args Arguments for the processing callback
 * @param array $extratags For any extra tags to include
 * @param array $removetags For any tags (default or extra) to exclude
 * @param callable $warnCallback (Deprecated) Callback allowing the
 *   addition of a tracking category when bad input is encountered.
 *   DO NOT ADD NEW PARAMETERS AFTER $warnCallback, since it will be
 *   removed shortly.
 * @return string
 */
public static function removeHTMLtags( $text, $processCallback = null,
	$args = [], $extratags = [], $removetags = [], $warnCallback = null
) {
	// Defines $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest,
	// $tabletags, $htmllist, $listtags, $htmlsingleallowed, $htmlelements.
	extract( self::getRecognizedTagData( $extratags, $removetags ) );

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );
	$bits = explode( '<', $text );
	// Everything before the first '<' is plain text: escape stray '>'.
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );
	if ( !MWTidy::isEnabled() ) {
		$tagstack = $tablestack = [];
		foreach ( $bits as $x ) {
			$regs = [];
			# $slash: Does the current element start with a '/'?
			# $t: Current element name
			# $params: String between element name and >
			# $brace: Ending '>' or '/>'
			# $rest: Everything until the next element of $bits
			if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;
			} else {
				$slash = $t = $params = $brace = $rest = null;
			}

			$badtag = false;
			$t = strtolower( $t );
			if ( isset( $htmlelements[$t] ) ) {
				# Check our stack
				if ( $slash && isset( $htmlsingleonly[$t] ) ) {
					// Close tag for an element that cannot have one
					$badtag = true;
				} elseif ( $slash ) {
					# Closing a tag... is it the one we just opened?
					MediaWiki\suppressWarnings();
					$ot = array_pop( $tagstack );
					MediaWiki\restoreWarnings();

					if ( $ot != $t ) {
						if ( isset( $htmlsingleallowed[$ot] ) ) {
							# Pop all elements with an optional close tag
							# and see if we find a match below them
							$optstack = [];
							array_push( $optstack, $ot );
							MediaWiki\suppressWarnings();
							$ot = array_pop( $tagstack );
							MediaWiki\restoreWarnings();
							while ( $ot != $t && isset( $htmlsingleallowed[$ot] ) ) {
								array_push( $optstack, $ot );
								MediaWiki\suppressWarnings();
								$ot = array_pop( $tagstack );
								MediaWiki\restoreWarnings();
							}
							if ( $t != $ot ) {
								# No match. Push the optional elements back again
								$badtag = true;
								MediaWiki\suppressWarnings();
								$ot = array_pop( $optstack );
								MediaWiki\restoreWarnings();
								while ( $ot ) {
									array_push( $tagstack, $ot );
									MediaWiki\suppressWarnings();
									$ot = array_pop( $optstack );
									MediaWiki\restoreWarnings();
								}
							}
						} else {
							// Not the tag we expected: put it back on the stack
							MediaWiki\suppressWarnings();
							array_push( $tagstack, $ot );
							MediaWiki\restoreWarnings();

							# <li> can be nested in <ul> or <ol>, skip those cases:
							if ( !isset( $htmllist[$ot] ) || !isset( $listtags[$t] ) ) {
								$badtag = true;
							}
						}
					} else {
						if ( $t == 'table' ) {
							// Leaving a table: restore the stack saved when it was opened
							$tagstack = array_pop( $tablestack );
						}
					}
					$newparams = '';
				} else {
					# Keep track for later
					if ( isset( $tabletags[$t] ) && !in_array( 'table', $tagstack ) ) {
						$badtag = true;
					} elseif ( in_array( $t, $tagstack ) && !isset( $htmlnest[$t] ) ) {
						$badtag = true;
					# Is it a self closed htmlpair ? (bug 5487)
					} elseif ( $brace == '/>' && isset( $htmlpairs[$t] ) ) {
						// Eventually we'll just remove the self-closing
						// slash, in order to be consistent with HTML5
						// semantics.
						// $brace = '>';
						// For now, let's just warn authors to clean up.
						if ( is_callable( $warnCallback ) ) {
							call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
						}
						$badtag = true;
					} elseif ( isset( $htmlsingleonly[$t] ) ) {
						# Hack to force empty tag for unclosable elements
						$brace = '/>';
					} elseif ( isset( $htmlsingle[$t] ) ) {
						# Hack to not close $htmlsingle tags
						$brace = null;
						# Still need to push this optionally-closed tag to
						# the tag stack so that we can match end tags
						# instead of marking them as bad.
						array_push( $tagstack, $t );
					} elseif ( isset( $tabletags[$t] ) && in_array( $t, $tagstack ) ) {
						// New table tag but forgot to close the previous one
						$text .= "</$t>";
					} else {
						if ( $t == 'table' ) {
							// Entering a table: save the outer stack and start a fresh one
							array_push( $tablestack, $tagstack );
							$tagstack = [];
						}
						array_push( $tagstack, $t );
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, [ &$params, $args ] );
					}

					if ( !Sanitizer::validateTag( $params, $t ) ) {
						$badtag = true;
					}

					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( !$badtag ) {
					// Text following the tag: escape stray '>'
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			// Unrecognized or rejected tag: emit the piece with a literal
			// '<' in front and every '>' escaped
			$text .= '<' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) {
				$tagstack = array_pop( $tablestack );
			}
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			if ( preg_match( self::ELEMENT_BITS_REGEX, $x, $regs ) ) {
				list( /* $qbar */, $slash, $t, $params, $brace, $rest ) = $regs;

				$badtag = false;
				$t = strtolower( $t );
				if ( isset( $htmlelements[$t] ) ) {
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, [ &$params, $args ] );
					}

					if ( $brace == '/>' && !( isset( $htmlsingle[$t] ) || isset( $htmlsingleonly[$t] ) ) ) {
						// Eventually we'll just remove the self-closing
						// slash, in order to be consistent with HTML5
						// semantics.
						// $brace = '>';
						// For now, let's just warn authors to clean up.
						if ( is_callable( $warnCallback ) ) {
							call_user_func_array( $warnCallback, [ 'deprecated-self-close-category' ] );
						}
					}
					if ( !Sanitizer::validateTag( $params, $t ) ) {
						$badtag = true;
					}

					$newparams = Sanitizer::fixTagAttributes( $params, $t );
					if ( !$badtag ) {
						if ( $brace === '/>' && !isset( $htmlsingleonly[$t] ) ) {
							# Interpret self-closing tags as empty tags even when
							# HTML 5 would interpret them as start tags. Such input
							# is commonly seen on Wikimedia wikis with this intention.
							$brace = "></$t>";
						}

						// Text following the tag: escape stray '>'
						$rest = str_replace( '>', '&gt;', $rest );
						$text .= "<$slash$t$newparams$brace$rest";
						continue;
					}
				}
			}
			// Unrecognized or rejected tag: emit the piece with a literal
			// '<' in front and every '>' escaped
			$text .= '<' . str_replace( '>', '&gt;', $x );
		}
	}
	return $text;
}
656
|
|
|
|
657
|
|
|
/**
 * Strip every complete HTML comment ('<!--' through '-->') from the text.
 *
 * When a comment sits alone on its line, i.e. only spaces separate it
 * from a newline before and a newline after, the surrounding spaces and
 * one of the two newlines are removed as well so no blank line remains.
 * An unterminated comment, and anything after it, is left untouched.
 *
 * @param string $text
 * @return string
 */
public static function removeHTMLcomments( $text ) {
	$opener = '<!--';
	$closer = '-->';

	for ( ; ; ) {
		$commentStart = strpos( $text, $opener );
		if ( $commentStart === false ) {
			# No comments left
			return $text;
		}

		$closerPos = strpos( $text, $closer, $commentStart + strlen( $opener ) );
		if ( $closerPos === false ) {
			# Unterminated comment; leave the rest as it is
			return $text;
		}
		$commentEnd = $closerPos + strlen( $closer );

		# Grow a window around the comment: start one character before it,
		# walk left over spaces, then walk right over spaces.
		$windowStart = max( $commentStart - 1, 0 );
		$windowLen = $commentEnd - $windowStart;
		while ( $windowStart > 0 && substr( $text, $windowStart, 1 ) === ' ' ) {
			$windowStart--;
			$windowLen++;
		}
		while ( substr( $text, $windowStart + $windowLen, 1 ) === ' ' ) {
			$windowLen++;
		}

		$newlineBefore = substr( $text, $windowStart, 1 ) === "\n";
		$newlineAfter = substr( $text, $windowStart + $windowLen, 1 ) === "\n";

		if ( $newlineBefore && $newlineAfter ) {
			# Comment is alone on its line: collapse the window and both
			# newlines into a single newline.
			$text = substr_replace( $text, "\n", $windowStart, $windowLen + 1 );
		} else {
			# Inline comment: cut out only the comment itself.
			$text = substr_replace( $text, '', $commentStart, $commentEnd - $commentStart );
		}
	}
}
699
|
|
|
|
700
|
|
|
/**
 * Decide whether a tag may appear in content, given its raw attribute
 * string and element name.
 *
 * This does NOT validate the attributes themselves, nor tags in general.
 * It only covers elements that are acceptable solely when specific
 * attributes are present:
 * - <meta> and <link> need an itemprop attribute
 * - <meta> additionally needs a content attribute
 * - <link> additionally needs an href attribute
 * Every other element is accepted.
 *
 * @param string $params
 * @param string $element
 * @return bool
 */
static function validateTag( $params, $element ) {
	$attribs = Sanitizer::decodeTagAttributes( $params );

	$isMeta = ( $element == 'meta' );
	$isLink = ( $element == 'link' );
	if ( !$isMeta && !$isLink ) {
		// No special requirements for any other element
		return true;
	}

	// <meta> and <link> must have an itemprop="" otherwise they are not valid or safe in content
	if ( !isset( $attribs['itemprop'] ) ) {
		return false;
	}

	// <meta> must have a content="" for the itemprop
	if ( $isMeta && !isset( $attribs['content'] ) ) {
		return false;
	}

	// <link> must have an associated href=""
	if ( $isLink && !isset( $attribs['href'] ) ) {
		return false;
	}

	return true;
}
732
|
|
|
|
733
|
|
|
/**
 * Normalize or discard attribute values for a given element type.
 *
 * Looks up the attribute whitelist for $element and hands the work to
 * validateAttributes(), which:
 * - discards attributes not on the whitelist
 * - passes style attributes through checkCss()
 * - re-encodes id attributes
 *
 * @param array $attribs Attribute names mapped to values
 * @param string $element
 * @return array
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
static function validateTagAttributes( $attribs, $element ) {
	$allowedNames = self::attributeWhitelist( $element );

	return self::validateAttributes( $attribs, $allowedNames );
}
752
|
|
|
|
753
|
|
|
/**
 * Normalize or discard attribute values against a whitelist of names.
 *
 * - xmlns:* declarations are kept unless their value matches
 *   EVIL_URI_PATTERN
 * - other attributes are kept only if whitelisted or if they are
 *   unreserved, colon-free data-* attributes
 * - style values are passed through checkCss()
 * - id values and id reference lists are escaped
 * - RDFa / microdata values matching EVIL_URI_PATTERN are dropped
 * - href / src values not using an allowed protocol are dropped
 * - itemtype, itemid and itemref are dropped when itemscope is absent
 *
 * @param array $attribs Attribute names mapped to values
 * @param array $whitelist List of allowed attribute names
 * @return array
 *
 * @todo Check for legal values where the DTD limits things.
 * @todo Check for unique id attribute :P
 */
static function validateAttributes( $attribs, $whitelist ) {
	$allowed = array_flip( $whitelist );
	$protocolPattern = '/^(' . wfUrlProtocols() . ')[^\s]+$/';

	# Attributes whose value is a list of element ids
	$idRefListAttribs = [
		'aria-describedby', 'aria-flowto', 'aria-labelledby', 'aria-owns',
	];
	# RDFa and microdata properties allow URLs, URIs and/or CURIs
	$uriLikeAttribs = [
		'rel', 'rev',
		# RDFa
		'about', 'property', 'resource', 'datatype', 'typeof',
		# HTML5 microdata
		'itemid', 'itemprop', 'itemref', 'itemscope', 'itemtype',
	];

	$out = [];
	foreach ( $attribs as $name => $val ) {
		# Allow XML namespace declaration to allow RDFa
		if ( preg_match( self::XMLNS_ATTRIBUTE_PATTERN, $name ) ) {
			if ( !preg_match( self::EVIL_URI_PATTERN, $val ) ) {
				$out[$name] = $val;
			}
			continue;
		}

		# Allow any attribute beginning with "data-"
		# However:
		# * data-ooui is reserved for ooui
		# * data-mw and data-parsoid are reserved for parsoid
		# * data-mw-<name here> is reserved for extensions (or core) if
		#   they need to communicate some data to the client and want to be
		#   sure that it isn't coming from an untrusted user.
		# * Ensure that the attribute is not namespaced by banning
		#   colons.
		$isFreeDataAttrib = preg_match( '/^data-(?!ooui|mw|parsoid)[^:]*$/i', $name );
		if ( !$isFreeDataAttrib && !isset( $allowed[$name] ) ) {
			continue;
		}

		# Strip javascript "expression" from stylesheets.
		# http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
		if ( $name == 'style' ) {
			$val = Sanitizer::checkCss( $val );
		}

		# Escape HTML id attributes
		if ( $name === 'id' ) {
			$val = Sanitizer::escapeId( $val, 'noninitial' );
		}

		# Escape HTML id reference lists
		if ( in_array( $name, $idRefListAttribs, true ) ) {
			$val = Sanitizer::escapeIdReferenceList( $val, 'noninitial' );
		}

		// Paranoia. Allow "simple" values but suppress javascript
		if ( in_array( $name, $uriLikeAttribs, true )
			&& preg_match( self::EVIL_URI_PATTERN, $val )
		) {
			continue;
		}

		# NOTE: even though elements using href/src are not allowed directly, supply
		#       validation code that can be used by tag hook handlers, etc
		if ( ( $name === 'href' || $name === 'src' )
			&& !preg_match( $protocolPattern, $val )
		) {
			// Drop any href or src attributes not using an allowed protocol.
			// NOTE: this also drops all relative URLs
			continue;
		}

		// A later occurrence of the same attribute name replaces an
		// earlier one, so the output has one attribute of each name.
		$out[$name] = $val;
	}

	# itemtype, itemid, itemref don't make sense without itemscope
	if ( !array_key_exists( 'itemscope', $out ) ) {
		unset( $out['itemtype'], $out['itemid'], $out['itemref'] );
	}
	# TODO: Strip itemprop if we aren't descendants of an itemscope or pointed to by an itemref.

	return $out;
}
860
|
|
|
|
861
|
|
|
/**
 * Combine two sets of HTML attributes into one.
 *
 * Entries in $b win over same-named entries in $a. The single exception is
 * 'class': when both sets carry a string class value and the two strings
 * differ, the class lists are joined, split on whitespace and de-duplicated.
 *
 * @todo implement merging for other attributes such as style
 * @param array $a Base attributes
 * @param array $b Overriding attributes
 * @return array
 */
static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	$hasBothClasses = isset( $a['class'] ) && isset( $b['class'] );
	if ( !$hasBothClasses || !is_string( $a['class'] ) || !is_string( $b['class'] ) ) {
		// Nothing to combine; plain override semantics apply.
		return $merged;
	}
	if ( $a['class'] === $b['class'] ) {
		// Identical class strings: array_merge() already left the right value.
		return $merged;
	}

	$tokens = preg_split( '/\s+/', $a['class'] . ' ' . $b['class'], -1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $tokens ) );

	return $merged;
}
883
|
|
|
|
884
|
|
|
/**
 * Normalize CSS into a format we can easily search for hostile input
 *  - decode character references
 *  - decode escape sequences
 *  - convert characters that IE6 interprets into ascii
 *  - remove comments, unless the entire value is one single comment
 *
 * @param string $value The css string
 * @return string Normalized css
 */
public static function normalizeCss( $value ) {
	// Decode character references like &#123;
	$value = Sanitizer::decodeCharReferences( $value );

	// Decode escape sequences and line continuation
	// See the grammar in the CSS 2 spec, appendix D.
	// This has to be done AFTER decoding character references.
	// This means it isn't possible for this function to return
	// unsanitized escape sequences. It is possible to manufacture
	// input that contains character references that decode to
	// escape sequences that decode to character references, but
	// it's OK for the return value to contain character references
	// because the caller is supposed to escape those anyway.
	static $decodeRegex;
	if ( !$decodeRegex ) {
		$space = '[\\x20\\t\\r\\n\\f]';
		$nl = '(?:\\n|\\r\\n|\\r|\\f)';
		$backslash = '\\\\';
		$decodeRegex = "/ $backslash
			(?:
				($nl) |  # 1. Line continuation
				([0-9A-Fa-f]{1,6})$space? |  # 2. character number
				(.) | # 3. backslash cancelling special meaning
				() | # 4. backslash at end of string
			)/xu";
	}
	$value = preg_replace_callback( $decodeRegex,
		[ __CLASS__, 'cssDecodeCallback' ], $value );

	// Normalize Halfwidth and Fullwidth Unicode block that IE6 might treat as ascii.
	// The class is spelled with explicit code point escapes so that it cannot be
	// silently folded to ASCII by tooling: U+FF01 to U+FF5A, excluding U+FF3C (bug 58088)
	$value = preg_replace_callback(
		'/[\x{FF01}-\x{FF3B}\x{FF3D}-\x{FF5A}]/u',
		function ( $matches ) {
			$cp = UtfNormal\Utils::utf8ToCodepoint( $matches[0] );
			if ( $cp === false ) {
				return '';
			}
			// 65248 == 0xFEE0, the offset between the fullwidth forms and ASCII
			return chr( $cp - 65248 ); // ASCII range \x21-\x7A
		},
		$value
	);

	// Convert more characters IE6 might treat as ascii
	// U+0280, U+0274, U+207F, U+029F, U+026A, U+207D, U+208D
	$value = str_replace(
		[ 'ʀ', 'ɴ', 'ⁿ', 'ʟ', 'ɪ', '⁽', '₍' ],
		[ 'r', 'n', 'n', 'l', 'i', '(', '(' ],
		$value
	);

	// Let the value through if it's nothing but a single comment, to
	// allow other functions which may reject it to pass some error
	// message through.
	if ( !preg_match( '! ^ \s* /\* [^*\\/]* \*/ \s* $ !x', $value ) ) {
		// Remove any comments; IE gets token splitting wrong
		// This must be done AFTER decoding character references and
		// escape sequences, because those steps can introduce comments
		// This step cannot introduce character references or escape
		// sequences, because it replaces comments with spaces rather
		// than removing them completely.
		$value = StringUtils::delimiterReplace( '/*', '*/', ' ', $value );

		// Remove anything after a comment-start token, to guard against
		// incorrect client implementations.
		$commentPos = strpos( $value, '/*' );
		if ( $commentPos !== false ) {
			$value = substr( $value, 0, $commentPos );
		}
	}

	// S followed by repeat, iteration, or prolonged sound marks,
	// which IE will treat as "ss"
	$value = preg_replace(
		'/s(?:
			\xE3\x80\xB1 | # U+3031
			\xE3\x82\x9D | # U+309D
			\xE3\x83\xBC | # U+30FC
			\xE3\x83\xBD | # U+30FD
			\xEF\xB9\xBC | # U+FE7C
			\xEF\xB9\xBD | # U+FE7D
			\xEF\xBD\xB0   # U+FF70
		)/ix',
		'ss',
		$value
	);

	return $value;
}
982
|
|
|
|
983
|
|
|
/**
 * Inspect a CSS value for forbidden or unsafe constructs and return a
 * sanitized string.
 *
 * The value is first passed through normalizeCss(). If the normalized value
 * contains control characters or the UTF-8 replacement character, or matches
 * one of the rejected constructs (expression, filter:, accelerator:,
 * -o-link:, -o-link-source:, -o-replace:, url(, image(, image-set(, or an
 * attr() that mentions url), only a CSS comment describing the problem is
 * returned. Otherwise the normalized value is returned.
 *
 * NOTE: Despite the fact that character references are decoded, the
 * returned string may contain character references given certain
 * clever input strings. These character references must
 * be escaped before the return value is embedded in HTML.
 *
 * @param string $value
 * @return string
 */
static function checkCss( $value ) {
	$css = self::normalizeCss( $value );

	// Reject control characters and anything that decoded to U+FFFD
	$hasControlChar = preg_match( '/[\000-\010\013\016-\037\177]/', $css );
	if ( $hasControlChar || strpos( $css, UtfNormal\Constants::UTF8_REPLACEMENT ) !== false ) {
		return '/* invalid control char */';
	}

	// Reject problematic keywords and functions
	$forbidden = '! expression
		| filter\s*:
		| accelerator\s*:
		| -o-link\s*:
		| -o-link-source\s*:
		| -o-replace\s*:
		| url\s*\(
		| image\s*\(
		| image-set\s*\(
		| attr\s*\([^)]+[\s,]+url
	!ix';
	if ( preg_match( $forbidden, $css ) ) {
		return '/* insecure input */';
	}

	return $css;
}
1024
|
|
|
|
1025
|
|
|
/**
 * Replacement callback for the CSS escape-sequence pattern built in
 * normalizeCss().
 *
 * @param array $matches Match groups: 1 = line continuation, 2 = hex
 *   character number, 3 = any other escaped character; all empty means a
 *   backslash at the end of the string
 * @return string Replacement text for the escape sequence
 */
static function cssDecodeCallback( $matches ) {
	if ( $matches[1] !== '' ) {
		// Line continuation: the backslash-newline pair simply disappears
		return '';
	}

	if ( $matches[2] !== '' ) {
		$decoded = UtfNormal\Utils::codepointToUtf8( hexdec( $matches[2] ) );
	} elseif ( $matches[3] !== '' ) {
		$decoded = $matches[3];
	} else {
		$decoded = '\\';
	}

	switch ( $decoded ) {
		case "\n":
		case '"':
		case "'":
		case '\\':
			// These characters need to be escaped in strings.
			// Emit a clean hex escape to avoid parsing errors by clients.
			return '\\' . dechex( ord( $decoded ) ) . ' ';
		default:
			// The escape was unnecessary; return the bare character
			return $decoded;
	}
}
1049
|
|
|
|
1050
|
|
|
/**
 * Take a tag soup fragment listing an HTML element's attributes
 * and normalize it to well-formed XML, discarding unwanted attributes.
 * Output is safe for further wikitext processing, with escaping of
 * values that could trigger problems.
 *
 * The work is delegated in three steps: decodeTagAttributes() parses the
 * fragment, validateTagAttributes() filters it for the given element, and
 * safeEncodeTagAttributes() renders the survivors. Blank input yields ''.
 *
 * @param string $text Raw attribute text
 * @param string $element Element name the attributes belong to
 * @param bool $sorted Whether to sort the attributes by name (default: false)
 * @return string
 */
static function fixTagAttributes( $text, $element, $sorted = false ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$attribs = Sanitizer::validateTagAttributes(
		Sanitizer::decodeTagAttributes( $text ),
		$element
	);

	if ( $sorted ) {
		ksort( $attribs );
	}

	return Sanitizer::safeEncodeTagAttributes( $attribs );
}
1085
|
|
|
|
1086
|
|
|
/**
 * Encode an attribute value for HTML output.
 *
 * The value is escaped with htmlspecialchars() (both quote styles), and
 * line feed, carriage return and tab are then turned into numeric
 * character references.
 *
 * @param string $text
 * @return string HTML-encoded text fragment
 */
static function encodeAttribute( $text ) {
	$encValue = htmlspecialchars( $text, ENT_QUOTES );

	// Whitespace is normalized during attribute decoding,
	// so if we've been passed non-spaces we must encode them
	// ahead of time or they won't be preserved.
	$encValue = strtr( $encValue, [
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	] );

	return $encValue;
}
1105
|
|
|
|
1106
|
|
|
/**
 * Encode an attribute value for HTML tags, with extra armoring
 * against further wiki processing.
 *
 * After encodeAttribute(), every token that later wikitext parsing could
 * act on is rewritten using a character reference, and the ':' of any
 * recognised URL protocol is armored through armorLinksCallback().
 *
 * @param string $text
 * @return string HTML-encoded text fragment
 */
static function safeEncodeAttribute( $text ) {
	$encValue = Sanitizer::encodeAttribute( $text );

	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$encValue = strtr( $encValue, [
		'<'    => '&lt;',   // This should never happen,
		'>'    => '&gt;',   // we've received invalid input
		'"'    => '&quot;', // which should have been escaped.
		'{'    => '&#123;',
		'['    => '&#91;',
		"''"   => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC'  => '&#82;FC',
		'PMID' => '&#80;MID',
		'|'    => '&#124;',
		'__'   => '&#95;_',
	] );

	# Stupid hack
	$encValue = preg_replace_callback(
		'/((?i)' . wfUrlProtocols() . ')/',
		[ 'Sanitizer', 'armorLinksCallback' ],
		$encValue );
	return $encValue;
}
1138
|
|
|
|
1139
|
|
|
/**
 * Given a value, escape it so that it can be used in an id attribute and
 * return it. This will use HTML5 validation if $wgExperimentalHtmlIds is
 * true, allowing anything but ASCII whitespace. Otherwise it will use
 * HTML 4 rules, which means a narrow subset of ASCII, with bad characters
 * escaped with lots of dots.
 *
 * To ensure we don't have to bother escaping anything, we also strip ', ",
 * & even if $wgExperimentalHtmlIds is true. TODO: Is this the best tactic?
 * We also strip # because it upsets IE, and % because it could be
 * ambiguous if it's part of something that looks like a percent escape
 * (which don't work reliably in fragments cross-browser).
 *
 * @see https://www.w3.org/TR/html401/types.html#type-name Valid characters
 *   in the id and name attributes
 * @see https://www.w3.org/TR/html401/struct/links.html#h-12.2.3 Anchors with
 *   the id attribute
 * @see https://www.w3.org/TR/html5/dom.html#the-id-attribute
 *   HTML5 definition of id attribute
 *
 * @param string $id Id to escape
 * @param string|array $options String or array of strings (default is array()):
 *   'noninitial': This is a non-initial fragment of an id, not a full id,
 *       so don't pay attention if the first character isn't valid at the
 *       beginning of an id.  Only matters if $wgExperimentalHtmlIds is
 *       false.
 *   'legacy': Behave the way the old HTML 4-based ID escaping worked even
 *       if $wgExperimentalHtmlIds is used, so we can generate extra
 *       anchors and links won't break.
 * @return string
 */
static function escapeId( $id, $options = [] ) {
	global $wgExperimentalHtmlIds;

	$flags = (array)$options;
	$decoded = Sanitizer::decodeCharReferences( $id );

	if ( $wgExperimentalHtmlIds && !in_array( 'legacy', $flags ) ) {
		// HTML5 mode: collapse runs of disallowed characters to one underscore
		$html5Id = trim( preg_replace( '/[ \t\n\r\f_\'"&#%]+/', '_', $decoded ), '_' );

		// An empty result means the input consisted only of stripped characters
		return $html5Id === '' ? '_' : $html5Id;
	}

	// HTML4-style escaping: percent-encode, then turn "%3A" back into ":"
	// and every remaining "%" into "."
	$escaped = str_replace(
		[ '%3A', '%' ],
		[ ':', '.' ],
		urlencode( str_replace( ' ', '_', $decoded ) )
	);

	$startsWithLetter = preg_match( '/^[a-zA-Z]/', $escaped );
	if ( !$startsWithLetter && !in_array( 'noninitial', $flags ) ) {
		// Initial character must be a letter!
		$escaped = "x$escaped";
	}

	return $escaped;
}
1202
|
|
|
|
1203
|
|
|
/**
 * Given a string containing a space delimited list of ids, escape each id
 * to match ids escaped by the escapeId() function.
 *
 * @since 1.27
 *
 * @param string $referenceString Space delimited list of ids
 * @param string|array $options String or array of strings (default is array()),
 *   passed unchanged to escapeId() for every id; see escapeId() for the
 *   meaning of 'noninitial' and 'legacy'.
 * @return string Space delimited list of escaped ids; '' when the input
 *   contains no ids
 */
static function escapeIdReferenceList( $referenceString, $options = [] ) {
	# Split on whitespace, dropping empty tokens
	$tokens = preg_split( '/\s+/', "{$referenceString}", -1, PREG_SPLIT_NO_EMPTY );

	# Escape each token as an id
	$escaped = array_map(
		function ( $token ) use ( $options ) {
			return Sanitizer::escapeId( $token, $options );
		},
		$tokens
	);

	# Join back into a space delimited list
	return implode( ' ', $escaped );
}
1235
|
|
|
|
1236
|
|
|
/**
 * Given a value, escape it so that it can be used as a CSS class and
 * return it.
 *
 * A leading digit or hyphen, ASCII control characters and space, most ASCII
 * punctuation, and the byte pair \xC2\xA0 are each replaced by an
 * underscore; runs of underscores are then collapsed and trailing
 * underscores removed.
 *
 * @todo For extra validity, input should be validated UTF-8.
 *
 * @see https://www.w3.org/TR/CSS21/syndata.html Valid characters/format
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
	// Convert ugly stuff to underscores...
	$uglyChars = '/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/';
	$underscored = preg_replace( $uglyChars, '_', $class );

	// ...and kill underscores in ugly places
	$collapsed = preg_replace( '/_+/', '_', $underscored );

	return rtrim( $collapsed, '_' );
}
1254
|
|
|
|
1255
|
|
|
/**
 * Given HTML input, escape with htmlspecialchars but un-escape entities.
 * This allows (generally harmless) entities like &#160; to survive.
 *
 * Character references are decoded first, then the whole string is escaped
 * with both quote styles.
 *
 * @param string $html HTML to escape
 * @return string Escaped input
 */
static function escapeHtmlAllowEntities( $html ) {
	# It seems wise to escape ' as well as ", as a matter of course.  Can't
	# hurt.
	return htmlspecialchars( Sanitizer::decodeCharReferences( $html ), ENT_QUOTES );
}
1269
|
|
|
|
1270
|
|
|
/**
 * Regex replace callback for armoring links against further processing.
 *
 * Every ':' in the first capture group is replaced by its numeric
 * character reference.
 *
 * @param array $matches Match groups; group 1 holds the matched protocol
 * @return string
 */
private static function armorLinksCallback( $matches ) {
	return str_replace( ':', '&#58;', $matches[1] );
}
1278
|
|
|
|
1279
|
|
|
/**
 * Return an associative array of attribute names and values from
 * a partial tag string. Attribute names are forced to lowercase,
 * character references are decoded to UTF-8 text.
 *
 * Whitespace runs (tab, CR, LF, space) inside a value are collapsed to a
 * single space and the value is trimmed before decoding. When a name occurs
 * more than once, the last occurrence wins.
 *
 * @param string $text
 * @return array Empty when $text is blank or contains no attributes
 */
public static function decodeTagAttributes( $text ) {
	$decoded = [];
	if ( trim( $text ) == '' ) {
		return $decoded;
	}

	$matchSets = [];
	$found = preg_match_all( self::getAttribsRegex(), $text, $matchSets, PREG_SET_ORDER );
	if ( !$found ) {
		return $decoded;
	}

	foreach ( $matchSets as $set ) {
		$name = strtolower( $set[1] );
		$raw = Sanitizer::getTagAttributeCallback( $set );

		// Normalize whitespace
		$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		// Decode character references
		$decoded[$name] = Sanitizer::decodeCharReferences( $collapsed );
	}

	return $decoded;
}
1315
|
|
|
|
1316
|
|
|
/**
 * Build a partial tag string from an associative array of attribute
 * names and values as returned by decodeTagAttributes.
 *
 * Each pair is rendered as name="value", with the name passed through
 * htmlspecialchars() and the value through safeEncodeAttribute(). Every
 * pair is preceded by one space, so the result can be appended directly
 * after a tag name.
 *
 * @param array $assoc_array
 * @return string '' when there are no attributes
 */
public static function safeEncodeTagAttributes( $assoc_array ) {
	$rendered = '';
	foreach ( $assoc_array as $name => $value ) {
		$encName = htmlspecialchars( $name );
		$encValue = Sanitizer::safeEncodeAttribute( $value );
		$rendered .= ' ' . $encName . '="' . $encValue . '"';
	}
	return $rendered;
}
1333
|
|
|
|
1334
|
|
|
/**
 * Pick the appropriate attribute value from a match set from the
 * attribs regex matches.
 *
 * @param array $set Match groups; 5 = unquoted value, 4 = single-quoted
 *   value, 3 = double-quoted value
 * @throws MWException When group 2 is set but none of groups 3 to 5 is
 * @return string
 */
private static function getTagAttributeCallback( $set ) {
	// Check the value groups in priority order: unquoted, single-quoted, double-quoted
	foreach ( [ 5, 4, 3 ] as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( isset( $set[2] ) ) {
		throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
	}

	# In XHTML, attributes must have a value so return an empty string.
	# See "Empty attribute syntax",
	# https://www.w3.org/TR/html5/syntax.html#syntax-attribute-name
	return "";
}
1361
|
|
|
|
1362
|
|
|
/**
 * Replace each CRLF pair, space, carriage return, line feed or tab with a
 * single space. Consecutive whitespace characters are not collapsed.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$whitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
	return preg_replace( $whitespace, ' ', $text );
}
1372
|
|
|
|
1373
|
|
|
/**
 * Normalizes whitespace in a section name, such as might be returned
 * by Parser::stripSectionName(), for use in the id's that are used for
 * section links.
 *
 * Runs of spaces and underscores become a single space, and the result is
 * trimmed.
 *
 * @param string $section
 * @return string
 */
static function normalizeSectionNameWhitespace( $section ) {
	$collapsed = preg_replace( '/[ _]+/', ' ', $section );
	return trim( $collapsed );
}
1384
|
|
|
|
1385
|
|
|
/**
 * Ensure that any entities and character references are legal
 * for XML and XHTML specifically. Any stray bits will be
 * &amp;-escaped to result in a valid text fragment.
 *
 * a. named char refs can only be &lt; &gt; &amp; &quot;, others are
 *   numericized (this way we're well-formed even without a DTD)
 * b. any numeric char refs must be legal chars, not invalid or forbidden
 * c. use lower cased "&#x", not "&#X"
 * d. fix or reject non-valid attributes
 *
 * Each match of CHAR_REFS_REGEX is handed to
 * normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 * @private
 */
static function normalizeCharReferences( $text ) {
	$callback = [ 'Sanitizer', 'normalizeCharReferencesCallback' ];
	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1406
|
|
|
|
1407
|
|
|
/**
 * Replacement callback for normalizeCharReferences().
 *
 * @param array $matches Match groups of CHAR_REFS_REGEX: 1 = entity name,
 *   2 = decimal number, 3 = hexadecimal number
 * @return string The normalized reference, or the HTML-escaped original
 *   match when no group matched or the reference was rejected
 */
static function normalizeCharReferencesCallback( $matches ) {
	$normalized = null;

	if ( $matches[1] != '' ) {
		$normalized = Sanitizer::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$normalized = Sanitizer::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$normalized = Sanitizer::hexCharReference( $matches[3] );
	}

	// null means "not a usable reference": fall back to escaping the raw text
	return $normalized === null ? htmlspecialchars( $matches[0] ) : $normalized;
}
1426
|
|
|
|
1427
|
|
|
/**
 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 * return the equivalent numeric entity reference (except for the core &lt;
 * &gt; &amp; &quot;). If the entity is a MediaWiki-specific alias, returns
 * the HTML equivalent. Otherwise, returns HTML-escaped text of
 * pseudo-entity source (eg &amp;foo;)
 *
 * @param string $name Entity name without the leading '&' and trailing ';'
 * @return string
 */
static function normalizeEntity( $name ) {
	if ( isset( self::$htmlEntityAliases[$name] ) ) {
		return '&' . self::$htmlEntityAliases[$name] . ';';
	}

	// Strict comparison so that only these exact names are treated as core entities
	if ( in_array( $name, [ 'lt', 'gt', 'amp', 'quot' ], true ) ) {
		return "&$name;";
	}

	if ( isset( self::$htmlEntities[$name] ) ) {
		return '&#' . self::$htmlEntities[$name] . ';';
	}

	// Unknown name: escape the ampersand so the text is shown literally
	// instead of being emitted as an undefined entity.
	return "&amp;$name;";
}
1448
|
|
|
|
1449
|
|
|
/**
 * Normalize a decimal numeric character reference.
 *
 * @param int $codepoint Decimal digits of the reference; converted with intval()
 * @return null|string "&#N;" when validateCodepoint() accepts the code
 *   point, null otherwise
 */
static function decCharReference( $codepoint ) {
	$point = intval( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#%d;', $point )
		: null;
}
1461
|
|
|
|
1462
|
|
|
/**
 * Normalize a hexadecimal numeric character reference.
 *
 * @param int $codepoint Hexadecimal digits of the reference; converted with hexdec()
 * @return null|string "&#xN;" with lower-case hex digits when
 *   validateCodepoint() accepts the code point, null otherwise
 */
static function hexCharReference( $codepoint ) {
	$point = hexdec( $codepoint );
	return Sanitizer::validateCodepoint( $point )
		? sprintf( '&#x%x;', $point )
		: null;
}
1474
|
|
|
|
1475
|
|
|
/**
 * Returns true if a given Unicode codepoint is a valid character in
 * both HTML5 and XML.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	# U+000C is valid in HTML5 but not allowed in XML.
	# U+000D is valid in XML but not allowed in HTML5.
	# U+007F - U+009F are disallowed in HTML5 (control characters).
	if ( $codepoint == 0x09 || $codepoint == 0x0a ) {
		return true;
	}

	// Inclusive [low, high] ranges of accepted code points
	$accepted = [
		[ 0x20, 0x7e ],
		[ 0xa0, 0xd7ff ],
		[ 0xe000, 0xfffd ],
		[ 0x10000, 0x10ffff ],
	];
	foreach ( $accepted as $range ) {
		if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}

	return false;
}
1492
|
|
|
|
1493
|
|
|
/**
 * Decode any character references, numeric or named entities,
 * in the text and return a UTF-8 string.
 *
 * Each match of CHAR_REFS_REGEX is handed to
 * decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$callback = [ 'Sanitizer', 'decodeCharReferencesCallback' ];
	return preg_replace_callback( self::CHAR_REFS_REGEX, $callback, $text );
}
1506
|
|
|
|
1507
|
|
|
/**
 * Decode any character references, numeric or named entities,
 * in the text and normalize the resulting string. (bug 14952)
 *
 * This is useful for page titles, not for text to be displayed,
 * MediaWiki allows HTML entities to escape normalization as a feature.
 *
 * $wgContLang->normalize() is only invoked when the pattern matched at
 * least once; otherwise the text is returned as it came out of decoding.
 *
 * @param string $text Already normalized, containing entities
 * @return string Still normalized, without entities
 */
public static function decodeCharReferencesAndNormalize( $text ) {
	global $wgContLang;

	$replacements = 0;
	$decoded = preg_replace_callback(
		self::CHAR_REFS_REGEX,
		[ 'Sanitizer', 'decodeCharReferencesCallback' ],
		$text,
		/* limit */ -1,
		$replacements
	);

	if ( !$replacements ) {
		// Nothing matched, so the input is unchanged and needs no re-normalization
		return $decoded;
	}

	return $wgContLang->normalize( $decoded );
}
1530
|
|
|
|
1531
|
|
|
/**
 * Replacement callback for decodeCharReferences() and
 * decodeCharReferencesAndNormalize().
 *
 * @param array $matches Match groups of CHAR_REFS_REGEX: 1 = entity name,
 *   2 = decimal number, 3 = hexadecimal number
 * @return string Decoded text, or the original match when none of the
 *   three groups matched
 */
static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return Sanitizer::decodeEntity( $matches[1] );
	}
	if ( $matches[2] != '' ) {
		return Sanitizer::decodeChar( intval( $matches[2] ) );
	}
	if ( $matches[3] != '' ) {
		return Sanitizer::decodeChar( hexdec( $matches[3] ) );
	}

	# Last case should be an ampersand by itself
	return $matches[0];
}
1546
|
|
|
|
1547
|
|
|
/**
 * Return UTF-8 string for a codepoint if that is a valid
 * character reference, otherwise U+FFFD REPLACEMENT CHARACTER.
 *
 * @param int $codepoint
 * @return string
 * @private
 */
static function decodeChar( $codepoint ) {
	return Sanitizer::validateCodepoint( $codepoint )
		? UtfNormal\Utils::codepointToUtf8( $codepoint )
		: UtfNormal\Constants::UTF8_REPLACEMENT;
}
1561
|
|
|
|
1562
|
|
|
/**
 * If the named entity is defined in the HTML 4.0/XHTML 1.0 DTD,
 * return the UTF-8 encoding of that character. Otherwise, returns
 * pseudo-entity source (eg "&foo;")
 *
 * Aliases listed in $htmlEntityAliases are resolved first, so the
 * pseudo-entity text for an unknown alias target uses the resolved name.
 *
 * @param string $name Entity name without the leading '&' and trailing ';'
 * @return string
 */
static function decodeEntity( $name ) {
	$canonical = isset( self::$htmlEntityAliases[$name] )
		? self::$htmlEntityAliases[$name]
		: $name;

	if ( !isset( self::$htmlEntities[$canonical] ) ) {
		return "&$canonical;";
	}

	return UtfNormal\Utils::codepointToUtf8( self::$htmlEntities[$canonical] );
}
1580
|
|
|
|
1581
|
|
|
/**
 * Fetch the whitelist of acceptable attributes for a given element name.
 *
 * @param string $element
 * @return array The element's entry from setupAttributeWhitelist(), or an
 *   empty array when the element has no entry
 */
static function attributeWhitelist( $element ) {
	$whitelist = Sanitizer::setupAttributeWhitelist();
	if ( !isset( $whitelist[$element] ) ) {
		return [];
	}
	return $whitelist[$element];
}
1593
|
|
|
|
1594
|
|
|
/**
 * Build the map of allowed HTML element name => list of attribute
 * names allowed on that element.
 *
 * The map is built on the first call and kept in a function-static
 * variable, so later calls return the same array without rebuilding it.
 *
 * @return array
 */
static function setupAttributeWhitelist() {
	static $cached = null;

	if ( $cached === null ) {
		# Attributes accepted on (almost) every whitelisted element
		$universal = [
			# HTML
			'id',
			'class',
			'style',
			'lang',
			'dir',
			'title',

			# WAI-ARIA
			'aria-describedby',
			'aria-flowto',
			'aria-label',
			'aria-labelledby',
			'aria-owns',
			'role',

			# RDFa, as specified in section 9 of
			# https://www.w3.org/TR/2008/REC-rdfa-syntax-20081014
			'about',
			'property',
			'resource',
			'datatype',
			'typeof',

			# Microdata, as specified by
			# https://html.spec.whatwg.org/multipage/microdata.html#the-microdata-model
			'itemid',
			'itemprop',
			'itemref',
			'itemscope',
			'itemtype',
		];

		$cellOnly = [
			'abbr',
			'axis',
			'headers',
			'scope',
			'rowspan',
			'colspan',
			'nowrap', # deprecated
			'width', # deprecated
			'height', # deprecated
			'bgcolor', # deprecated
		];
		$alignment = [ 'align', 'valign' ];

		# Attribute lists shared by more than one element
		$blockLevel = array_merge( $universal, [ 'align' ] );
		$quotable = array_merge( $universal, [ 'cite' ] );
		$revision = array_merge( $universal, [ 'cite', 'datetime' ] );
		$columnar = array_merge( $universal, [ 'span' ] );
		$cell = array_merge( $universal, $cellOnly, $alignment );

		# Section numbers refer to the HTML 4.01 standard,
		# see https://www.w3.org/TR/html4/
		$cached = [
			# 7.5.4
			'div' => $blockLevel,
			'center' => $universal, # deprecated
			'span' => $universal,

			# 7.5.5
			'h1' => $blockLevel,
			'h2' => $blockLevel,
			'h3' => $blockLevel,
			'h4' => $blockLevel,
			'h5' => $blockLevel,
			'h6' => $blockLevel,

			# 7.5.6
			# address

			# 8.2.4
			'bdo' => $universal,

			# 9.2.1
			'em' => $universal,
			'strong' => $universal,
			'cite' => $universal,
			'dfn' => $universal,
			'code' => $universal,
			'samp' => $universal,
			'kbd' => $universal,
			'var' => $universal,
			'abbr' => $universal,
			# acronym

			# 9.2.2
			'blockquote' => $quotable,
			'q' => $quotable,

			# 9.2.3
			'sub' => $universal,
			'sup' => $universal,

			# 9.3.1
			'p' => $blockLevel,

			# 9.3.2
			'br' => array_merge( $universal, [ 'clear' ] ),

			# https://www.w3.org/TR/html5/text-level-semantics.html#the-wbr-element
			'wbr' => $universal,

			# 9.3.4
			'pre' => array_merge( $universal, [ 'width' ] ),

			# 9.4
			'ins' => $revision,
			'del' => $revision,

			# 10.2
			'ul' => array_merge( $universal, [ 'type' ] ),
			'ol' => array_merge( $universal, [ 'type', 'start', 'reversed' ] ),
			'li' => array_merge( $universal, [ 'type', 'value' ] ),

			# 10.3
			'dl' => $universal,
			'dd' => $universal,
			'dt' => $universal,

			# 11.2.1
			'table' => array_merge( $universal, [
				'summary', 'width', 'border', 'frame',
				'rules', 'cellspacing', 'cellpadding',
				'align', 'bgcolor',
			] ),

			# 11.2.2
			'caption' => $blockLevel,

			# 11.2.3
			'thead' => $universal,
			'tfoot' => $universal,
			'tbody' => $universal,

			# 11.2.4
			'colgroup' => $columnar,
			'col' => $columnar,

			# 11.2.5
			'tr' => array_merge( $universal, [ 'bgcolor' ], $alignment ),

			# 11.2.6
			'td' => $cell,
			'th' => $cell,

			# 12.2
			# NOTE: <a> is not allowed directly, but the attrib
			# whitelist is used from the Parser object
			# (rel/rev are there especially for RDFa)
			'a' => array_merge( $universal, [ 'href', 'rel', 'rev' ] ),

			# 13.2
			# Not usually allowed, but may be used for extension-style hooks
			# such as <math> when it is rasterized, or if $wgAllowImageTag is
			# true
			'img' => array_merge( $universal, [ 'alt', 'src', 'width', 'height' ] ),

			# 15.2.1
			'tt' => $universal,
			'b' => $universal,
			'i' => $universal,
			'big' => $universal,
			'small' => $universal,
			'strike' => $universal,
			's' => $universal,
			'u' => $universal,

			# 15.2.2
			'font' => array_merge( $universal, [ 'size', 'color', 'face' ] ),
			# basefont

			# 15.3
			'hr' => array_merge( $universal, [ 'width' ] ),

			# HTML Ruby annotation text module, simple ruby only.
			# https://www.w3.org/TR/html5/text-level-semantics.html#the-ruby-element
			'ruby' => $universal,
			# rbc
			'rb' => $universal,
			'rp' => $universal,
			'rt' => $universal, # 'rbspan' is not whitelisted
			'rtc' => $universal,

			# MathML root element, where used for extensions
			# 'title' may not be 100% valid here; it's XHTML
			# https://www.w3.org/TR/REC-MathML/
			'math' => [ 'class', 'style', 'id', 'title' ],

			# HTML 5 section 4.6
			'bdi' => $universal,

			# HTML5 elements, defined by:
			# https://html.spec.whatwg.org/multipage/semantics.html#the-data-element
			'data' => array_merge( $universal, [ 'value' ] ),
			'time' => array_merge( $universal, [ 'datetime' ] ),
			'mark' => $universal,

			// meta and link are only permitted by removeHTMLtags when Microdata
			// is enabled, so no conditional is added here to hide them.
			// They are also only valid in WikiText as Microdata elements
			// (ie: validateTag rejects tags missing the attributes needed for
			// Microdata), so the common attributes, which would have no
			// purpose there, are left out.
			'meta' => [ 'itemprop', 'content' ],
			'link' => [ 'itemprop', 'href' ],
		];
	}

	return $cached;
}
1812
|
|
|
|
1813
|
|
|
/**
 * Reduce a fragment of (potentially invalid) HTML to plain text by
 * removing every tag from it.
 *
 * Warning: as of 1.10 the return value must be further escaped before
 * it is included literally in HTML output!
 *
 * @param string $text HTML fragment
 * @return string
 */
static function stripAllTags( $text ) {
	# Remove the actual <tags>
	$withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

	# Then normalize &entities and whitespace, in that order
	return self::normalizeWhitespace(
		self::decodeCharReferences( $withoutTags )
	);
}
1833
|
|
|
|
1834
|
|
|
/**
 * Produce a private DOCTYPE that declares every entity listed in
 * self::$htmlEntities as a numeric character reference.
 *
 * PHP 4 seemed to know HTML's standard entities if you gave it an HTML
 * doctype, but PHP 5.1 doesn't.
 *
 * Use for passing XHTML fragments to PHP's XML parsing functions.
 *
 * @return string
 */
static function hackDocType() {
	$declarations = [];
	foreach ( self::$htmlEntities as $entity => $codepoint ) {
		$declarations[] = "<!ENTITY $entity \"&#$codepoint;\">";
	}

	// Declarations are concatenated with no separator between them
	return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
1851
|
|
|
|
1852
|
|
|
/**
 * Clean up a URL: decode character references, percent-encode
 * characters that are unsafe in a link target, and tidy the host part.
 *
 * In the host part, characters that are ignored in IDNs are stripped
 * and a percent-encoded bracketed IPv6 literal is restored to its
 * "[...]" form. Input that does not look like "protocol:..." is
 * returned after the first two steps only.
 *
 * @param string $url
 * @return mixed|string
 */
static function cleanUrl( $url ) {
	# Normalize any HTML entities in the input first. They will be
	# re-escaped by makeExternalLink().
	$url = self::decodeCharReferences( $url );

	# Escape brackets, angle brackets, double quotes, pipes and any
	# control characters or spaces introduced by the step above
	$url = preg_replace_callback(
		'/[\][<>"\\x00-\\x20\\x7F\|]/',
		[ __CLASS__, 'cleanUrlCallback' ],
		$url
	);

	# Split into protocol, host and remainder
	$parts = [];
	if ( !preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $parts ) ) {
		return $url;
	}
	$protocol = $parts[1];
	$host = $parts[2];
	$rest = $parts[3];

	// Characters that will be ignored in IDNs.
	// https://tools.ietf.org/html/rfc3454#section-3.1
	// Strip them before further processing so blacklists and such work.
	$ignoredInIdn = "/
		\\s|          # general whitespace
		\xc2\xad|     # 00ad SOFT HYPHEN
		\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
		\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
		\xe2\x81\xa0| # 2060 WORD JOINER
		\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
		\xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
		\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
		\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
		\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
		\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
		\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
		[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe0f VARIATION SELECTOR-1-16
		/xuD";
	$host = preg_replace( $ignoredInIdn, '', $host );

	// IPv6 host names are bracketed with []; the escaping step above
	// turned those into %5B / %5D, so url-decode them again here.
	$ipv6 = [];
	if ( strncmp( $host, '//%5B', 5 ) === 0 &&
		preg_match( '!^//%5B([0-9A-Fa-f:.]+)%5D((:\d+)?)$!', $host, $ipv6 )
	) {
		$host = '//[' . $ipv6[1] . ']' . $ipv6[2];
	}

	// @todo FIXME: Validate hostnames here

	return $protocol . $host . $rest;
}
1905
|
|
|
|
1906
|
|
|
/**
 * preg_replace_callback() helper for cleanUrl(): url-encode the text
 * matched by the whole pattern.
 *
 * @param array $matches Match array; only index 0 is read
 * @return string
 */
static function cleanUrlCallback( $matches ) {
	$matched = $matches[0];

	return urlencode( $matched );
}
1913
|
|
|
|
1914
|
|
|
/**
 * Does a string look like an e-mail address?
 *
 * This validates an email address using an HTML5 specification found at:
 * http://www.whatwg.org/html/states-of-the-type-attribute.html#valid-e-mail-address
 * Which as of 2011-01-24 says:
 *
 *   A valid e-mail address is a string that matches the ABNF production
 *   1*( atext / "." ) "@" ldh-str *( "." ldh-str ) where atext is defined
 *   in RFC 5322 section 3.2.3, and ldh-str is defined in RFC 1034 section
 *   3.5.
 *
 * This function is an implementation of the specification as requested in
 * bug 22449.
 *
 * Client-side forms will use the same standard validation rules via JS or
 * HTML 5 validation; additional restrictions can be enforced server-side
 * by extensions via the 'isValidEmailAddr' hook. When that hook aborts
 * (Hooks::run() returns false), the $result it set is returned as-is and
 * the built-in check is skipped.
 *
 * Note that this validation doesn't 100% match RFC 2822, but is believed
 * to be liberal enough for wide use. Some invalid addresses will still
 * pass validation here.
 *
 * The whole string must match: an address followed by a trailing newline
 * is rejected.
 *
 * @since 1.18
 *
 * @param string $addr E-mail address
 * @return bool
 */
public static function validateEmail( $addr ) {
	$result = null;
	if ( !Hooks::run( 'isValidEmailAddr', [ $addr, &$result ] ) ) {
		return $result;
	}

	// Please note strings below are enclosed in brackets [], this makes the
	// hyphen "-" a range indicator. Hence it is double backslashed below.
	// See bug 26948
	$rfc5322_atext = "a-z0-9!#$%&'*+\\-\/=?^_`{|}~";
	$rfc1034_ldh_str = "a-z0-9\\-";

	// The D (dollar-end-only) modifier is required: without it "$" also
	// matches just before a final newline, so "user@example.org\n" would
	// be accepted as valid.
	$html5_email_regexp = "/
		^                         # start of string
		[$rfc5322_atext\\.]+      # user part which is liberal :p
		@                         # at sign
		[$rfc1034_ldh_str]+       # First domain part
		(\\.[$rfc1034_ldh_str]+)* # Following part prefixed with a dot
		$                         # End of string
		/ixD"; // case Insensitive, eXtended, Dollar matches at very end only

	return (bool)preg_match( $html5_email_regexp, $addr );
}
1965
|
|
|
} |
1966
|
|
|
|