1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* htmlfilter.inc |
4
|
|
|
* --------------- |
5
|
|
|
* This set of functions allows you to filter html in order to remove |
6
|
|
|
* any malicious tags from it. Useful in cases when you need to filter |
7
|
|
|
* user input for any cross-site-scripting attempts. |
8
|
|
|
* |
9
|
|
|
* Copyright (C) 2002-2004 by Duke University |
10
|
|
|
* |
11
|
|
|
* This library is free software; you can redistribute it and/or |
12
|
|
|
* modify it under the terms of the GNU Lesser General Public |
13
|
|
|
* License as published by the Free Software Foundation; either |
14
|
|
|
* version 2.1 of the License, or (at your option) any later version. |
15
|
|
|
* |
16
|
|
|
* This library is distributed in the hope that it will be useful, |
17
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
18
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
19
|
|
|
* Lesser General Public License for more details. |
20
|
|
|
* |
21
|
|
|
* You should have received a copy of the GNU Lesser General Public |
22
|
|
|
* License along with this library; if not, write to the Free Software |
23
|
|
|
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
24
|
|
|
* 02110-1301 USA |
25
|
|
|
* |
26
|
|
|
* @Author Konstantin Riabitsev <[email protected]> |
27
|
|
|
* @Author Jim Jagielski <[email protected] / [email protected]> |
28
|
|
|
* @Version 1.1 ($Date$) |
29
|
|
|
* @param mixed $tagname |
30
|
|
|
* @param mixed $attary |
31
|
|
|
* @param mixed $tagtype |
32
|
|
|
*/ |
33
|
|
|
|
34
|
|
|
/**
 * Builds the textual representation of one tag from its name, its
 * attributes and its type.
 *
 * Tag types: 2 produces a closing tag ("</name>"); 3 produces an
 * XHTML-style self-closing tag ("<name ... />"); any other value
 * produces a plain opening tag ("<name ...>").
 *
 * @param string $tagname the name of the tag.
 * @param array  $attary  map of attribute name => attribute value. Values
 *                        are emitted verbatim as name=value, so they must
 *                        already carry their quotes. Non-arrays and empty
 *                        arrays produce no attributes.
 * @param int    $tagtype the type of the tag (see above).
 * @return string the assembled tag.
 */
function tln_tagprint($tagname, $attary, $tagtype)
{
    // Closing tags never carry attributes, so they can be emitted right away.
    if (2 == $tagtype) {
        return '</' . $tagname . '>';
    }

    // The tag name and every name=value pair are joined by single spaces.
    $pieces = [$tagname];
    if (is_array($attary) && count($attary)) {
        foreach ($attary as $attname => $attvalue) {
            $pieces[] = "$attname=$attvalue";
        }
    }

    $closer = (3 == $tagtype) ? ' />' : '>';

    return '<' . implode(' ', $pieces) . $closer;
}
65
|
|
|
|
66
|
|
|
/**
 * Lowercases a value in place. Intended as an array_walk() callback.
 *
 * @param string $val the value to lowercase; modified by reference.
 */
function tln_casenormalize(&$val)
{
    $lowered = mb_strtolower($val);
    $val = $lowered;
}
76
|
|
|
|
77
|
|
|
/**
 * Skips whitespace starting at the given position and returns the
 * position of the next non-whitespace character.
 *
 * @param string $body   the string to scan.
 * @param int    $offset the character offset at which to start looking.
 * @return int the offset of the first non-whitespace character at or
 *             after $offset. $offset is returned unchanged when no
 *             whitespace is found there (including when $offset lies at
 *             or past the end of $body).
 */
function tln_skipspace($body, $offset)
{
    $matches = [];
    preg_match('/^(\s*)/s', mb_substr($body, $offset), $matches);

    // $matches[1] is a string (the run of leading whitespace), not an
    // array: measure it with mb_strlen() rather than count(), which
    // throws a TypeError for strings on PHP 8.
    if (isset($matches[1]) && '' !== $matches[1]) {
        $offset += mb_strlen($matches[1]);
    }

    return $offset;
}
97
|
|
|
|
98
|
|
|
/**
 * Finds the next occurrence of a string. This is a thin wrapper around
 * mb_strpos() that reports "not found" as the length of the haystack
 * instead of false, so callers always receive an integer.
 *
 * @param string $body   the string to search in.
 * @param int    $offset the character offset to start searching from.
 * @param string $needle the character/string to look for.
 * @return int the offset of the next occurrence of $needle, or
 *             mb_strlen($body) if $needle was not found.
 */
function tln_findnxstr($body, $offset, $needle)
{
    $found = mb_strpos($body, $needle, $offset);

    return (false === $found) ? mb_strlen($body) : $found;
}
118
|
|
|
|
119
|
|
|
/**
 * Finds the first place at or after $offset where a PCRE fragment
 * matches, and reports what lies between $offset and that match.
 *
 * The fragment is embedded into the pattern '%^(.*?)(' . $reg . ')%s',
 * so it must not contain an unescaped '%'.
 *
 * @param string $body   the string to search in.
 * @param int    $offset the character offset to start searching from.
 * @param string $reg    a PCRE fragment (without delimiters) to match.
 * @return array|bool false if nothing matched, otherwise an array with:
 *                    - int    the offset of the match within $body
 *                    - string the content between $offset and the match
 *                    - string the text that was matched
 */
function tln_findnxreg($body, $offset, $reg)
{
    $hits = [];
    preg_match('%^(.*?)(' . $reg . ')%s', mb_substr($body, $offset), $hits);

    // No match at all, or an empty/falsy overall match, counts as a miss.
    if (!isset($hits[0]) || !$hits[0]) {
        return false;
    }

    return [
        $offset + mb_strlen($hits[1]),
        $hits[1],
        $hits[2],
    ];
}
148
|
|
|
|
149
|
|
|
/**
 * Locates and parses the next tag in $body at or after $offset.
 *
 * There are three kinds of tags:
 *  1. Opening tag, e.g. <a href="blah">
 *  2. Closing tag, e.g. </a>
 *  3. XHTML-style content-less tag, e.g. <img src="blah"/>
 *
 * Attribute values are returned WITH their quotes:
 *  - attr='CDATA' and attr="CDATA" keep their original quote character;
 *  - attr=CDATA (unquoted) is wrapped in double quotes, with any embedded
 *    '"' converted to &quot; so it cannot terminate the value;
 *  - a bare attr (no value) becomes attr="yes".
 *
 * @param string $body   the string in which to look for the next tag.
 * @param int    $offset the character offset to start looking from.
 * @return array|bool false if no more tags exist in the body, or an
 *                    array with the following members:
 *                    - string      the lowercased name of the tag
 *                    - array|false attributes and their (quoted) values;
 *                                  false when the tag ends right after
 *                                  its name
 *                    - int         the tag type (1, 2, or 3)
 *                    - int         where the tag starts (the "<")
 *                    - int         where the tag ends (the ">"), or the
 *                                  body length if it is never closed
 *                    The first three members are all false when the tag
 *                    is invalid, a comment, or an SGML declaration.
 */
function tln_getnxtag($body, $offset)
{
    // $body is never modified here, so its length is computed once.
    $bodylen = mb_strlen($body);

    if ($offset > $bodylen) {
        return false;
    }
    $lt = tln_findnxstr($body, $offset, '<');
    if ($lt === $bodylen) {
        return false;
    }

    /**
     * We are here:
     * blah blah <tag attribute="value">
     * \---------^
     */
    $pos = tln_skipspace($body, $lt + 1);
    if ($pos >= $bodylen) {
        return [false, false, false, $lt, $bodylen];
    }

    // Decide the tag type from the first character after "<".
    switch (mb_substr($body, $pos, 1)) {
        case '/':
            $tagtype = 2;
            $pos++;
            break;
        case '!':
            // A comment or an SGML declaration; both are reported as
            // invalid so the caller strips them.
            if ('--' === mb_substr($body, $pos + 1, 2)) {
                $gt = mb_strpos($body, '-->', $pos);
                if (false === $gt) {
                    $gt = $bodylen;
                } else {
                    // Point at the ">" of "-->".
                    $gt += 2;
                }

                return [false, false, false, $lt, $gt];
            }
            $gt = tln_findnxstr($body, $pos, '>');

            return [false, false, false, $lt, $gt];
        default:
            // Assume type 1 for now; switched to 3 later if "/>" is found.
            $tagtype = 1;
            break;
    }

    // The first character that is not [\w\-_] ends the tag name.
    $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
    if (false === $regary) {
        return [false, false, false, $lt, $bodylen];
    }
    list($pos, $tagname, $match) = $regary;
    $tagname = mb_strtolower($tagname);

    /**
     * $match can be either of these:
     * '>'  indicating the end of the tag entirely.
     * '\s' indicating the end of the tag name.
     * '/'  indicating that this is a type-3 xhtml tag.
     *
     * Whatever else we find there indicates an invalid tag.
     */
    switch ($match) {
        case '/':
            // An xhtml-style tag such as <br/>: valid only if the slash
            // is immediately followed by the closing bracket.
            if ('/>' === mb_substr($body, $pos, 2)) {
                $pos++;
                $tagtype = 3;
            } else {
                $gt = tln_findnxstr($body, $pos, '>');

                return [false, false, false, $lt, $gt];
            }
            // intentional fall-through
            // no break
        case '>':
            return [$tagname, false, $tagtype, $lt, $pos];
        default:
            if (!preg_match('/\s/', $match)) {
                // Not whitespace: invalid tag. Skip to the next ">".
                $gt = tln_findnxstr($body, $lt, '>');

                return [false, false, false, $lt, $gt];
            }
            break;
    }

    /**
     * At this point we're here:
     * <tagname  attribute='blah'>
     * \-------^
     *
     * Loop in order to find all attributes.
     */
    $attary = [];

    while ($pos <= $bodylen) {
        $pos = tln_skipspace($body, $pos);
        if ($pos === $bodylen) {
            // Non-closed tag.
            return [false, false, false, $lt, $pos];
        }

        // A ">" or "/>" here means we reached the end of the tag.
        $matches = [];
        if (preg_match('%^(\s*)(>|/>)%s', mb_substr($body, $pos), $matches)) {
            $pos += mb_strlen($matches[1]);
            if ('/>' === $matches[2]) {
                $tagtype = 3;
                $pos++;
            }

            return [$tagname, $attary, $tagtype, $lt, $pos];
        }

        /**
         * There are several types of attributes, with optional
         * [:space:] between members.
         * Type 1: attrname[:space:]=[:space:]'CDATA'
         * Type 2: attrname[:space:]=[:space:]"CDATA"
         * Type 3: attr[:space:]=[:space:]CDATA
         * Type 4: attrname
         *
         * Types 1 and 2 are kept as they are. Type 3 gets '"' converted
         * to &quot; and is then wrapped in double quotes. Type 4 is
         * converted into attrname="yes".
         */
        $regary = tln_findnxreg($body, $pos, '[^\w\-_]');
        if (false === $regary) {
            // Looks like the body ended before the end of the tag.
            return [false, false, false, $lt, $bodylen];
        }
        list($pos, $attname, $match) = $regary;
        $attname = mb_strtolower($attname);

        /**
         * We arrived at the end of the attribute name. Possibilities:
         * '>'  the end of the tag; this is attribute type 4
         * '/'  if followed by '>', the same thing as above
         * '\s' many things -- look at what follows it
         * anything else means the attribute is invalid.
         */
        switch ($match) {
            case '/':
                if ('/>' === mb_substr($body, $pos, 2)) {
                    $pos++;
                    $tagtype = 3;
                } else {
                    $gt = tln_findnxstr($body, $pos, '>');

                    return [false, false, false, $lt, $gt];
                }
                // intentional fall-through
                // no break
            case '>':
                $attary[$attname] = '"yes"';

                return [$tagname, $attary, $tagtype, $lt, $pos];
            default:
                // Skip whitespace and see what we arrive at.
                $pos = tln_skipspace($body, $pos);
                $char = mb_substr($body, $pos, 1);

                /**
                 * Two things are valid here:
                 * '=' means this is attribute type 1, 2 or 3.
                 * \w  means this was attribute type 4.
                 * End of tag is caught by the checks at the top of the
                 * loop; anything else invalidates the tag.
                 */
                if ('=' === $char) {
                    $pos++;
                    $pos = tln_skipspace($body, $pos);

                    // "'" => type 1, '"' => type 2, anything else is
                    // the start of an unquoted (type 3) value.
                    $quot = mb_substr($body, $pos, 1);
                    if ('\'' === $quot) {
                        $regary = tln_findnxreg($body, $pos + 1, '\'');
                        if (false === $regary) {
                            return [false, false, false, $lt, $bodylen];
                        }
                        list($pos, $attval, $match) = $regary;
                        $pos++;
                        $attary[$attname] = '\'' . $attval . '\'';
                    } elseif ('"' === $quot) {
                        $regary = tln_findnxreg($body, $pos + 1, '\"');
                        if (false === $regary) {
                            return [false, false, false, $lt, $bodylen];
                        }
                        list($pos, $attval, $match) = $regary;
                        $pos++;
                        $attary[$attname] = '"' . $attval . '"';
                    } else {
                        // Unquoted value: it runs up to the next
                        // whitespace or ">". A ">" will be caught at
                        // the top of the loop.
                        $regary = tln_findnxreg($body, $pos, '[\s>]');
                        if (false === $regary) {
                            return [false, false, false, $lt, $bodylen];
                        }
                        list($pos, $attval, $match) = $regary;
                        // The value is about to be wrapped in double
                        // quotes, so an embedded '"' must be encoded or
                        // it would close the value early and let the
                        // remainder be read as extra attributes.
                        $attval = str_replace('"', '&quot;', $attval);
                        $attary[$attname] = '"' . $attval . '"';
                    }
                } elseif (preg_match('|[\w/>]|', $char)) {
                    // That was attribute type 4.
                    $attary[$attname] = '"yes"';
                } else {
                    // An illegal character. Find the next '>' and return.
                    $gt = tln_findnxstr($body, $pos, '>');

                    return [false, false, false, $lt, $gt];
                }
                break;
        }
    }

    // Getting here means the tag end was never found. Return the
    // invalid-tag indication so it gets stripped.
    return [false, false, false, $lt, $bodylen];
}
445
|
|
|
|
446
|
|
|
/**
 * Replaces numeric entities/escapes found by $regex with the literal
 * character they encode, so the value can be checked afterwards.
 *
 * The first capture group of $regex must hold the numeric code. Each
 * code is converted with chr(), so the result is a single byte per
 * entity.
 *
 * @param string $attvalue the value to translate; modified by reference.
 * @param string $regex    the regular expression locating the entities.
 * @param bool   $hex      whether the captured codes are hexadecimal.
 * @return bool true if at least one entity was found (and replaced),
 *              false otherwise.
 */
function tln_deent(&$attvalue, $regex, $hex = false)
{
    $found = [];
    preg_match_all($regex, $attvalue, $found);
    if (!is_array($found) || count($found[0]) <= 0) {
        return false;
    }

    // Map every matched entity text to the character it stands for.
    $map = [];
    foreach ($found[0] as $idx => $entity) {
        $code = $found[1][$idx];
        $map[$entity] = chr($hex ? hexdec($code) : $code);
    }
    $attvalue = strtr($attvalue, $map);

    return true;
}
473
|
|
|
|
474
|
|
|
/**
 * Decodes entity-encoded and backslash-escaped characters in a value so
 * that later checks see the literal text rather than its obfuscated
 * form. The following are translated into single 8-bit characters:
 *  - decimal entities, e.g. &#97; (leading zeros and a missing or
 *    repeated ';' are tolerated);
 *  - hexadecimal entities, e.g. &#x61;
 *  - backslash followed by digits, e.g. \61 (digits read as hex).
 * Decoding repeats until nothing more is found, so nested encodings are
 * unwrapped as well. Finally, remaining backslashes are removed with
 * stripslashes().
 *
 * @param string $attvalue the value to decode; modified by reference.
 */
function tln_defang(&$attvalue)
{
    // Nothing to decode if there are neither ampersands nor backslashes.
    if (false === mb_strpos($attvalue, '&')
        && false === mb_strpos($attvalue, '\\')) {
        return;
    }

    do {
        // Short-circuit evaluation: once one pass replaced something the
        // remaining patterns are left for the next iteration of the loop.
        $m = tln_deent($attvalue, '/&#0*(\d+);*/s')
            || tln_deent($attvalue, '/&#x0*((\d|[a-f])+);*/si', true)
            || tln_deent($attvalue, '/\\\\(\d+)/s', true);
    } while (true == $m);

    $attvalue = stripslashes($attvalue);
}
498
|
|
|
|
499
|
|
|
/**
 * Removes every tab, carriage return, newline, NUL byte and space from
 * a value, so that e.g. "java[tab]script" is seen as "javascript" by
 * the checks that follow.
 *
 * @param string $attvalue the value to clean; modified by reference.
 */
function tln_unspace(&$attvalue)
{
    // strcspn() counts bytes, so it must be compared against the byte
    // length. Comparing against mb_strlen() could report "nothing to
    // strip" for values where multibyte characters precede the
    // whitespace, leaving the whitespace in place.
    if (strcspn($attvalue, "\t\r\n\0 ") !== strlen($attvalue)) {
        $attvalue = str_replace(["\t", "\r", "\n", "\0", ' '], ['', '', '', '', ''], $attvalue);
    }
}
512
|
|
|
|
513
|
|
|
/**
 * Runs the attribute checks for one tag: removes forbidden attributes,
 * rewrites dangerous attribute values, sanitizes url() references in
 * style attributes, and finally appends any mandatory attributes.
 *
 * @param string $tagname               the name of the tag.
 * @param array  $attary                map of attribute name => value.
 * @param array  $rm_attnames           map of tag-name regex => list of
 *                                      attribute-name regexes; matching
 *                                      attributes are removed.
 * @param array  $bad_attvals           map of tag-name regex => map of
 *                                      attribute-name regex =>
 *                                      [patterns, replacements], applied
 *                                      to the value with preg_replace().
 * @param array  $add_attr_to_tag       map of tag-name regex => array of
 *                                      attributes merged into the result.
 * @param string $trans_image_path      replacement for rejected URLs
 *                                      (passed on to tln_fixurl).
 * @param bool   $block_external_images passed on to tln_fixurl.
 * @return array the modified attributes.
 */
function tln_fixatts(
    $tagname,
    $attary,
    $rm_attnames,
    $bad_attvals,
    $add_attr_to_tag,
    $trans_image_path,
    $block_external_images)
{
    foreach ($attary as $attname => $attvalue) {
        // See if this attribute should be removed.
        foreach ($rm_attnames as $matchtag => $matchattrs) {
            if (preg_match($matchtag, $tagname)) {
                foreach ($matchattrs as $matchattr) {
                    if (preg_match($matchattr, $attname)) {
                        unset($attary[$attname]);
                        // Move on to the next attribute. A bare
                        // "continue" would only leave the innermost
                        // loop, and the checks below could then re-add
                        // the attribute that was just removed.
                        continue 3;
                    }
                }
            }
        }

        // Remove any backslashes, entities, or extraneous whitespace.
        // A style value that needed decoding at all is treated as
        // hostile and replaced outright.
        $oldattvalue = $attvalue;
        tln_defang($attvalue);
        if ('style' == $attname && $attvalue !== $oldattvalue) {
            $attvalue = 'idiocy';
            $attary[$attname] = $attvalue;
        }
        tln_unspace($attvalue);

        // Run the value checks. The normalized value is only written
        // back when one of the rules actually changed it.
        foreach ($bad_attvals as $matchtag => $matchattrs) {
            if (preg_match($matchtag, $tagname)) {
                foreach ($matchattrs as $matchattr => $valary) {
                    if (preg_match($matchattr, $attname)) {
                        // $valary holds two arrays: the patterns to
                        // match and their replacements.
                        list($valmatch, $valrepl) = $valary;
                        $newvalue = preg_replace($valmatch, $valrepl, $attvalue);
                        if ($newvalue != $attvalue) {
                            $attary[$attname] = $newvalue;
                            $attvalue = $newvalue;
                        }
                    }
                }
            }
        }

        if ('style' == $attname) {
            if (preg_match('/[\0-\37\200-\377]+/', $attvalue)) {
                // Control or 8-bit characters: drop the whole value and
                // skip the url() rewrite, which would otherwise
                // overwrite this marker with the offending value.
                $attary[$attname] = '"disallowed character"';
                continue;
            }
            preg_match_all("/url\s*\((.+)\)/si", $attvalue, $aMatch);
            if (count($aMatch)) {
                foreach ($aMatch[1] as $sMatch) {
                    $urlvalue = $sMatch;
                    tln_fixurl($attname, $urlvalue, $trans_image_path, $block_external_images);
                    $attary[$attname] = str_replace($sMatch, $urlvalue, $attvalue);
                }
            }
        }
    }

    // See if we need to append any attributes to this tag.
    foreach ($add_attr_to_tag as $matchtag => $addattary) {
        if (preg_match($matchtag, $tagname)) {
            $attary = array_merge($attary, $addattary);
        }
    }

    return $attary;
}
609
|
|
|
|
610
|
|
|
/**
 * Validates a URL-valued attribute (or a url() argument from a style)
 * and rewrites it in place.
 *
 * Surrounding quotes, if present, are stripped for inspection and the
 * same quote character is used when a value is re-quoted (a double
 * quote is used when the input was unquoted). Outcomes:
 *  - empty value: replaced with the quoted $trans_image_path;
 *  - control or 8-bit characters: 'href' becomes a quoted placeholder
 *    URL, anything else the quoted $trans_image_path;
 *  - mailto/http/https/ftp: 'href' is kept (re-quoted); other attributes
 *    become the quoted $trans_image_path when $block_external_images is
 *    set or the URL has no path, and are otherwise kept (unquoted);
 *  - outbind/cid: kept (re-quoted);
 *  - any other scheme: replaced with the quoted $trans_image_path;
 *  - no scheme: replaced with the quoted $trans_image_path unless the
 *    path already equals $trans_image_path.
 *
 * @param string $attname               the attribute the value belongs to.
 * @param string $attvalue              the URL; modified by reference.
 * @param string $trans_image_path      replacement for rejected URLs.
 * @param bool   $block_external_images whether remote URLs in non-href
 *                                      attributes are replaced.
 */
function tln_fixurl($attname, &$attvalue, $trans_image_path, $block_external_images)
{
    $sQuote = '"';
    $attvalue = trim($attvalue);
    if ($attvalue && ('"' == $attvalue[0] || "'" == $attvalue[0])) {
        // Remember the quote character and strip the surrounding quotes.
        $sQuote = $attvalue[0];
        $attvalue = trim(mb_substr($attvalue, 1, -1));
    }

    /**
     * Replace empty src tags with the blank image. src is only used
     * for frames, images, and image inputs. Doing a replace should
     * not affect them working as should be, however it will stop
     * IE from being kicked off when src for img tags are not set.
     */
    if ('' === $attvalue) {
        $attvalue = $sQuote . $trans_image_path . $sQuote;
    } else {
        // first, disallow 8 bit characters and control characters
        if (preg_match('/[\0-\37\200-\377]+/', $attvalue)) {
            switch ($attname) {
                case 'href':
                    $attvalue = $sQuote . 'http://invalid-stuff-detected.example.com' . $sQuote;
                    break;
                default:
                    $attvalue = $sQuote . $trans_image_path . $sQuote;
                    break;
            }
        } else {
            $aUrl = parse_url($attvalue);
            if (isset($aUrl['scheme'])) {
                switch (mb_strtolower($aUrl['scheme'])) {
                    case 'mailto':
                    case 'http':
                    case 'https':
                    case 'ftp':
                        if ('href' != $attname) {
                            if (true == $block_external_images) {
                                $attvalue = $sQuote . $trans_image_path . $sQuote;
                            } else {
                                if (!isset($aUrl['path'])) {
                                    $attvalue = $sQuote . $trans_image_path . $sQuote;
                                }
                            }
                        } else {
                            $attvalue = $sQuote . $attvalue . $sQuote;
                        }
                        break;
                    case 'outbind':
                        $attvalue = $sQuote . $attvalue . $sQuote;
                        break;
                    case 'cid':
                        $attvalue = $sQuote . $attvalue . $sQuote;
                        break;
                    default:
                        $attvalue = $sQuote . $trans_image_path . $sQuote;
                        break;
                }
            } else {
                // No scheme (relative or unparsable URL): only the blank
                // image itself is let through. This must assign to
                // $attvalue; a variable-variable ($$attvalue) would
                // leave the by-reference value untouched.
                if (!isset($aUrl['path']) || $aUrl['path'] != $trans_image_path) {
                    $attvalue = $sQuote . $trans_image_path . $sQuote;
                }
            }
        }
    }
}
677
|
|
|
|
678
|
|
|
/**
 * Extracts the content of a <style> block and sanitizes it.
 *
 * Scanning starts at $pos (just after the opening <style ...> tag) and
 * collects everything up to the matching </style>. HTML comments
 * ("<!-- ... -->") inside the block are skipped, so a "</style>" placed
 * inside a comment does not end the block.
 *
 * NOTE(review): the scanner indexes $body byte-wise ($body[$i]) while
 * the loop bound and the comment search use mb_* character offsets.
 * These agree for single-byte content only -- confirm the intended
 * handling of multibyte bodies.
 *
 * @param string $body                  the HTML being filtered.
 * @param int    $pos                   offset just after the opening style tag.
 * @param string $trans_image_path      replacement for rejected URLs
 *                                      (passed on to tln_fixurl).
 * @param bool   $block_external_images passed on to tln_fixurl.
 * @return array two members: the sanitized style content and the offset
 *               just after "</style>"; or false and the body length
 *               when no closing tag was found.
 */
function tln_fixstyle($body, $pos, $trans_image_path, $block_external_images)
{
    $css = '';          // collected style content
    $token = '';        // pending "<" or "</..." that may become "</style>"
    $inEndTag = false;  // true while collecting a "</...>" token
    $newpos = null;     // offset after "</style>", once it has been found

    $total = mb_strlen($body);
    for ($i = $pos; $i < $total; ++$i) {
        $char = $body[$i];

        if ('<' === $char) {
            // Possible start of a tag or comment; held back for now.
            $token = $char;
        } elseif ('/' === $char) {
            if ('<' === $token) {
                $token .= $char;
                $inEndTag = true;
            } else {
                $css .= $char;
            }
        } elseif ('>' === $char) {
            if (!$inEndTag) {
                $css .= $char;
                continue;
            }
            $token .= $char;
            if (preg_match('/\<\/\s*style\s*\>/i', $token)) {
                $newpos = $i + 1;
                break;
            }
            // Some other closing tag: it is part of the content.
            $css .= $token;
            $inEndTag = false;
        } elseif ('!' === $char) {
            if ('<' === $token) {
                // possible comment
                if (isset($body[$i + 2]) && '!--' == mb_substr($body, $i, 3)) {
                    $i = mb_strpos($body, '-->', $i + 3);
                    if (false === $i) { // no end comment
                        $i = mb_strlen($body);
                    }
                    $token = '';
                }
            } else {
                $css .= $char;
            }
        } elseif ($inEndTag) {
            $token .= $char;
        } else {
            $css .= $char;
        }
    }

    if (null === $newpos) {
        // The closing </style> was never found.
        return [false, mb_strlen($body)];
    }

    /**
     * First look for a general BODY style declaration, like so:
     * body {background: blah-blah}
     * and change it to .bodyclass so it can be assigned to a <div>.
     */
    $css = preg_replace("|body(\s*\{.*?\})|si", '.bodyclass\\1', $css);

    // Reject the whole block when it contains 8-bit sequences or
    // disallowed control characters.
    if (preg_match('/[\16-\37\200-\377]+/', $css)) {
        return [
            '<!-- style block removed by html filter due to presence of 8bit characters -->',
            $newpos,
        ];
    }

    // remove @import line
    $css = preg_replace("/^\s*(@import.*)$/mi", "\n<!-- @import rules forbidden -->\n", $css);

    // Normalize backslash-obfuscated spellings of "url", then sanitize
    // every url(...) argument.
    $css = preg_replace('/(\\\\)?u(\\\\)?r(\\\\)?l(\\\\)?/i', 'url', $css);
    preg_match_all("/url\s*\((.+)\)/si", $css, $aMatch);
    if (count($aMatch)) {
        $originals = [];
        $cleaned = [];
        foreach ($aMatch[1] as $sMatch) {
            $urlvalue = $sMatch;
            tln_fixurl('style', $urlvalue, $trans_image_path, $block_external_images);
            $originals[] = $sMatch;
            $cleaned[] = $urlvalue;
        }
        $css = str_replace($originals, $cleaned, $css);
    }

    // Run the keyword checks on a copy with backslashes, entities and
    // whitespace removed. The cleaned copy replaces the content only
    // when one of the checks actually changed something.
    $normalized = $css;
    tln_defang($normalized);
    tln_unspace($normalized);

    $match = [
        '/\/\*.*\*\//',
        '/expression/i',
        '/behaviou*r/i',
        '/binding/i',
        '/include-source/i',
        '/javascript/i',
        '/script/i',
        '/position/i',
    ];
    $replace = ['', 'idiocy', 'idiocy', 'idiocy', 'idiocy', 'idiocy', 'idiocy', ''];
    $filtered = preg_replace($match, $replace, $normalized);
    if ($filtered !== $normalized) {
        $css = $filtered;
    }

    return [$css, $newpos];
}
810
|
|
|
|
811
|
|
|
/**
 * Converts the attributes of a <body> tag into attributes for a <div>.
 *
 * The result always carries class='bodyclass'. The 'background',
 * 'bgcolor' and 'text' attributes are translated into an inline style;
 * every other attribute is ignored. A background image is always
 * replaced by $trans_image_path.
 *
 * @param array|mixed $attary           map of attribute name => quoted
 *                                      value; anything that is not a
 *                                      non-empty array yields only the
 *                                      class attribute.
 * @param string      $trans_image_path URL used for the background image.
 * @return array the attributes for the replacement <div>.
 */
function tln_body2div($attary, $trans_image_path)
{
    $divattary = ['class' => "'bodyclass'"];
    if (!is_array($attary) || count($attary) <= 0) {
        return $divattary;
    }

    $defaultTextColor = '#000000';
    $sawBgColor = false;
    $sawTextColor = false;
    $rules = '';

    foreach ($attary as $attname => $attvalue) {
        // Strip the quoting: the first character of the value is taken
        // to be the quote and every occurrence of it is removed.
        $quote = mb_substr($attvalue, 0, 1);
        $bare = str_replace($quote, '', $attvalue);

        if ($attname == 'background') {
            $rules .= "background-image: url('$trans_image_path'); ";
        } elseif ($attname == 'bgcolor') {
            $sawBgColor = true;
            $rules .= "background-color: $bare; ";
        } elseif ($attname == 'text') {
            $sawTextColor = true;
            $rules .= "color: $bare; ";
        }
    }

    // Outlook defines a white bgcolor and no text color. This can lead to
    // white text on a white bg with certain themes.
    if ($sawBgColor && !$sawTextColor) {
        $rules .= "color: $defaultTextColor; ";
    }
    if (mb_strlen($rules) > 0) {
        $divattary['style'] = "\"$rules\"";
    }

    return $divattary;
}
850
|
|
|
|
851
|
|
|
/**
 * Rebuild $body tag by tag, keeping only the markup the supplied rules permit.
 *
 * The body is scanned with tln_getnxtag(); text between tags is copied to the
 * output unless it lies inside an element listed in $rm_tags_with_content.
 * <style> elements are handed to tln_fixstyle(), <body> is rewritten to <div>
 * (attributes converted by tln_body2div()), and the attributes of every kept
 * tag are passed through tln_fixatts(). The result is wrapped in
 * "begin/end tln_sanitized html" comments.
 *
 * Tag types as used below: 1 is treated as an opening tag, 2 as a closing
 * tag, and 3 is assigned to opening tags found in $self_closing_tags.
 *
 * @param string   $body                  The HTML you wish to filter
 * @param array    $tag_list              First element is a mode flag, the remaining elements are tag
 *                                        names: false removes the listed tags, true keeps only the
 *                                        listed tags
 * @param string[] $rm_tags_with_content  Tags dropped together with everything up to their closing tag
 * @param string[] $self_closing_tags     Opening tags that are re-typed as self-closing (type 3)
 * @param bool     $force_tag_closing     When true, closing tags are appended for every tag still
 *                                        counted as open at the end of the body
 * @param array    $rm_attnames           Passed unchanged to tln_fixatts()
 * @param array    $bad_attvals           Passed unchanged to tln_fixatts()
 * @param array    $add_attr_to_tag       Passed unchanged to tln_fixatts()
 * @param string   $trans_image_path      Passed to tln_fixstyle(), tln_fixatts() and tln_body2div()
 * @param bool     $block_external_images Passed to tln_fixstyle() and tln_fixatts()
 * @return string Sanitized html safe to show on your pages.
 */
function tln_sanitize(
    $body,
    $tag_list,
    $rm_tags_with_content,
    $self_closing_tags,
    $force_tag_closing,
    $rm_attnames,
    $bad_attvals,
    $add_attr_to_tag,
    $trans_image_path,
    $block_external_images
) {
    /**
     * Normalize rm_tags and rm_tags_with_content.
     * The leading element of $tag_list is the mode flag:
     * false means remove these tags
     * true means allow these tags
     */
    $rm_tags = array_shift($tag_list);
    // Warnings stay suppressed exactly as before; the lists are caller-supplied.
    @array_walk($tag_list, 'tln_casenormalize');
    @array_walk($rm_tags_with_content, 'tln_casenormalize');
    @array_walk($self_closing_tags, 'tln_casenormalize');

    $curpos = 0;
    // Tag name => number of opening tags emitted that have not been closed yet.
    $open_tags = [];
    $trusted = "<!-- begin tln_sanitized html -->\n";
    // false, or the name of the tag whose content is currently being dropped.
    $skip_content = false;

    /**
     * Take care of netscape's stupid javascript entities like
     * &{alert('boo')};
     * The leading ampersand is escaped so the sequence is rendered as text.
     * (The replacement used to be '&\1', which left the input unchanged.)
     */
    $body = preg_replace('/&(\{.*?\};)/si', '&amp;\\1', $body);

    while (false != ($curtag = tln_getnxtag($body, $curpos))) {
        list($tagname, $attary, $tagtype, $lt, $gt) = $curtag;
        // Text between the previous position and the start of this tag.
        $free_content = mb_substr($body, $curpos, $lt - $curpos);

        /**
         * Take care of <style>: tln_fixstyle() returns the cleaned style
         * content and the position at which scanning resumes.
         */
        if ('style' == $tagname && 1 == $tagtype) {
            list($free_content, $curpos) = tln_fixstyle($body, $gt + 1, $trans_image_path, $block_external_images);
            if (false != $free_content) {
                if (!empty($attary)) {
                    $attary = tln_fixatts($tagname, $attary, $rm_attnames, $bad_attvals, $add_attr_to_tag, $trans_image_path, $block_external_images);
                }
                $trusted .= tln_tagprint($tagname, $attary, $tagtype);
                $trusted .= $free_content;
                $trusted .= tln_tagprint($tagname, null, 2);
            }
            continue;
        }

        if (false == $skip_content) {
            $trusted .= $free_content;
        }

        if (false != $tagname) {
            if (2 == $tagtype) {
                if ($skip_content == $tagname) {
                    /**
                     * Got to the end of tag we needed to remove.
                     */
                    $tagname = false;
                    $skip_content = false;
                } elseif (false == $skip_content) {
                    if ('body' == $tagname) {
                        $tagname = 'div';
                    }
                    // Only emit a closing tag that matches a counted opening tag.
                    if (isset($open_tags[$tagname]) && $open_tags[$tagname] > 0) {
                        $open_tags[$tagname]--;
                    } else {
                        $tagname = false;
                    }
                }
            } elseif (false == $skip_content) {
                /**
                 * See if this is a self-closing type and change
                 * tagtype appropriately.
                 */
                if (1 == $tagtype && in_array($tagname, $self_closing_tags, true)) {
                    $tagtype = 3;
                }

                if (1 == $tagtype && in_array($tagname, $rm_tags_with_content, true)) {
                    /**
                     * Skip this tag and any content inside it.
                     */
                    $skip_content = $tagname;
                } elseif ((false == $rm_tags && in_array($tagname, $tag_list, true))
                    || (true == $rm_tags && !in_array($tagname, $tag_list, true))
                ) {
                    // Tag is rejected by the remove/allow list: drop the tag itself.
                    $tagname = false;
                } else {
                    /**
                     * Convert body into div.
                     */
                    if ('body' == $tagname) {
                        $tagname = 'div';
                        $attary = tln_body2div($attary, $trans_image_path);
                    }
                    if (1 == $tagtype) {
                        if (isset($open_tags[$tagname])) {
                            $open_tags[$tagname]++;
                        } else {
                            $open_tags[$tagname] = 1;
                        }
                    }
                    /**
                     * This is where we run other checks.
                     */
                    if (is_array($attary) && count($attary) > 0) {
                        $attary = tln_fixatts($tagname, $attary, $rm_attnames, $bad_attvals, $add_attr_to_tag, $trans_image_path, $block_external_images);
                    }
                }
            }

            if (false != $tagname && false == $skip_content) {
                $trusted .= tln_tagprint($tagname, $attary, $tagtype);
            }
        }

        $curpos = $gt + 1;
    }

    // Whatever follows the last tag is copied as-is.
    $trusted .= mb_substr($body, $curpos, mb_strlen($body) - $curpos);

    if (true == $force_tag_closing) {
        // Closing tags are appended per tag name, in the order names were first opened.
        foreach ($open_tags as $tagname => $opentimes) {
            while ($opentimes > 0) {
                $trusted .= '</' . $tagname . '>';
                $opentimes--;
            }
        }
        $trusted .= "\n";
    }
    $trusted .= "<!-- end tln_sanitized html -->\n";

    return $trusted;
}
1010
|
|
|
|
1011
|
|
|
/**
 * Filter $body through tln_sanitize() using this file's built-in rule set.
 *
 * The rule set removes a fixed list of tags, drops a second list of tags
 * together with their content, strips event-handler style attribute names,
 * rewrites script-like URLs in src/background/href/action/style values and
 * adds target="_blank" to <a> tags. Unclosed tags are force-closed.
 *
 * NOTE(review): the attribute-name keys '/^src|background/i' and
 * '/^href|action/i' anchor only their first alternative; they are kept
 * verbatim here.
 * NOTE(review): $trans_image_path is placed directly after a "\1"
 * backreference in the replacement strings; presumably these are consumed by
 * preg_replace() in tln_fixatts() - confirm that a path starting with a digit
 * cannot occur.
 *
 * @param string $body                  HTML to filter
 * @param string $trans_image_path      Replacement used for rejected image URLs
 * @param bool   $block_external_images When true, http/https image URLs in src/background
 *                                      attributes and in style url() values are replaced too
 * @return string the value returned by tln_sanitize()
 */
function HTMLFilter($body, $trans_image_path, $block_external_images = false)
{
    // The leading false selects "remove the listed tags" mode.
    $tag_list = [
        false,
        'object',
        'meta',
        'html',
        'head',
        'base',
        'link',
        'frame',
        'iframe',
        'plaintext',
        'marquee',
    ];

    $rm_tags_with_content = ['script', 'applet', 'embed', 'title', 'frameset', 'xmp', 'xml'];

    $self_closing_tags = ['img', 'br', 'hr', 'input', 'outbind'];

    $rm_attnames = [
        '/.*/' => [
            // "/target/i",
            '/^on.*/i',
            '/^dynsrc/i',
            '/^data.*/i',
            '/^lowsrc.*/i',
        ],
    ];

    // Script-like URL schemes; the same three patterns guard src/background and href/action.
    $scripted_url_patterns = [
        '/^([\'"])\s*\S+script\s*:.*([\'"])/si',
        '/^([\'"])\s*mocha\s*:*.*([\'"])/si',
        '/^([\'"])\s*about\s*:.*([\'"])/si',
    ];

    $quoted_placeholder = '\\1' . $trans_image_path . '\\2';
    $src_patterns = $scripted_url_patterns;
    $src_replacements = [$quoted_placeholder, $quoted_placeholder, $quoted_placeholder];

    $link_replacements = ['\\1#\\1', '\\1#\\1', '\\1#\\1'];

    // Patterns and replacements are paired by position.
    $style_patterns = [
        "/\/\*.*\*\//",
        '/expression/i',
        '/binding/i',
        '/behaviou*r/i',
        '/include-source/i',
        '/position\s*:/i',
        '/(\\\\)?u(\\\\)?r(\\\\)?l(\\\\)?/i',
        '/url\s*\(\s*([\'"])\s*\S+script\s*:.*([\'"])\s*\)/si',
        '/url\s*\(\s*([\'"])\s*mocha\s*:.*([\'"])\s*\)/si',
        '/url\s*\(\s*([\'"])\s*about\s*:.*([\'"])\s*\)/si',
        '/(.*)\s*:\s*url\s*\(\s*([\'"]*)\s*\S+script\s*:.*([\'"]*)\s*\)/si',
    ];
    $style_replacements = [
        '',
        'idiocy',
        'idiocy',
        'idiocy',
        'idiocy',
        'idiocy',
        'url',
        'url(\\1#\\1)',
        'url(\\1#\\1)',
        'url(\\1#\\1)',
        '\\1:url(\\2#\\3)',
    ];

    if ($block_external_images) {
        // Extra rules go last so they run after the built-in ones.
        $src_patterns[] = '/^([\'\"])\s*https*:.*([\'\"])/si';
        $src_replacements[] = '\\1' . $trans_image_path . '\\1';
        $style_patterns[] = '/url\(([\'\"])\s*https*:.*([\'\"])\)/si';
        $style_replacements[] = 'url(\\1' . $trans_image_path . '\\1)';
    }

    $bad_attvals = [
        '/.*/' => [
            '/^src|background/i' => [$src_patterns, $src_replacements],
            '/^href|action/i' => [$scripted_url_patterns, $link_replacements],
            '/^style/i' => [$style_patterns, $style_replacements],
        ],
    ];

    $add_attr_to_tag = [
        '/^a$/i' => ['target' => '"_blank"'],
    ];

    return tln_sanitize(
        $body,
        $tag_list,
        $rm_tags_with_content,
        $self_closing_tags,
        true,
        $rm_attnames,
        $bad_attvals,
        $add_attr_to_tag,
        $trans_image_path,
        $block_external_images
    );
}
1133
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.