| Total Complexity | 43 |
| Total Lines | 571 |
| Duplicated Lines | 0 % |
| Changes | 0 | ||
Complex classes like Xss often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Xss, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 21 | class Xss |
||
| 22 | {
|
||
/**
 * Clean
 *
 * Runs the input through a series of filters meant to neutralise
 * cross-site scripting payloads: decoding, removal of never-allowed
 * strings, escaping of PHP tags, link/image/script scrubbing,
 * tag/attribute sanitising and escaping of dangerous call parentheses.
 *
 * @param string|array $string  Input to clean; arrays are cleaned element by element
 * @param bool         $isImage When TRUE, the method reports whether the
 *                              filters changed anything instead of returning
 *                              the cleaned text
 *
 * @return string|array|bool Cleaned string (or array of cleaned values);
 *                           for images, TRUE if nothing had to be changed
 */
public static function clean($string, $isImage = false)
{
    // Is the string an array? Clean every element recursively.
    // foreach is used because each() was deprecated in PHP 7.2 and
    // removed in PHP 8.0.
    if (is_array($string)) {
        foreach ($string as $key => $value) {
            $string[ $key ] = self::clean($value);
        }

        return $string;
    }

    // Remove Invisible Characters
    $string = remove_invisible_characters($string);

    /*
     * URL Decode
     *
     * Just in case stuff like this is submitted:
     *
     * <a href="http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D">Google</a>
     *
     * Note: Use rawurldecode() so it does not remove plus signs.
     * The loop repeats so that multiply-encoded input is fully decoded.
     */
    do {
        $string = rawurldecode($string);
    } while (preg_match('/%[0-9a-f]{2,}/i', $string));

    /*
     * Convert character entities to ASCII
     *
     * This permits our tests below to work reliably.
     * We only convert entities that are within tags since
     * these are the ones that will pose security problems.
     */
    $string = preg_replace_callback(
        "/[^a-z0-9>]+[a-z0-9]+=([\'\"]).*?\\1/si",
        [self::class, 'convertAttribute'],
        $string
    );

    $string = preg_replace_callback('/<\w+.*/si', [self::class, 'decodeEntity'], $string);

    // Remove Invisible Characters Again!
    $string = remove_invisible_characters($string);

    /*
     * Convert all tabs to spaces
     *
     * This prevents strings like this: ja[tab]vascript
     * NOTE: we deal with spaces between characters later.
     * NOTE: preg_replace was found to be amazingly slow here on
     * large blocks of data, so we use str_replace.
     */
    $string = str_replace("\t", ' ', $string);

    // Capture converted string for later comparison (image mode)
    $convertedString = $string;

    // Remove Strings that are never allowed
    $string = self::doNeverAllowed($string);

    /*
     * Makes PHP tags safe
     *
     * Note: XML tags are inadvertently replaced too:
     *
     * <?xml
     *
     * But it doesn't seem to pose a problem.
     *
     * The replacements must emit HTML entities; replacing a tag with
     * itself would leave the PHP tags fully intact.
     */
    if ($isImage === true) {
        // Images have a tendency to have the PHP short opening and
        // closing tags every so often so we skip those and only
        // do the long opening tags.
        $string = preg_replace('/<\?(php)/i', '&lt;?\\1', $string);
    } else {
        $string = str_replace(['<?', '?' . '>'], ['&lt;?', '?&gt;'], $string);
    }

    /*
     * Compact any exploded words
     *
     * This corrects words like: j a v a s c r i p t
     * These words are compacted back to their correct state.
     */
    $words = [
        'javascript',
        'expression',
        'vbscript',
        'jscript',
        'wscript',
        'vbs',
        'script',
        'base64',
        'applet',
        'alert',
        'document',
        'write',
        'cookie',
        'window',
        'confirm',
        'prompt',
        'eval',
    ];

    foreach ($words as $word) {
        // Allow optional whitespace between every pair of letters
        $spaced = implode('\s*', str_split($word));

        // We only want to do this when it is followed by a non-word character
        // That way valid stuff like "dealer to" does not become "dealerto"
        $string = preg_replace_callback(
            '#(' . $spaced . ')(\W)#is',
            [self::class, 'compactExplodedWords'],
            $string
        );
    }

    /*
     * Remove disallowed Javascript in links or img tags
     * We used to do some version comparisons and use of stripos(),
     * but it is dog slow compared to these simplified non-capturing
     * preg_match(), especially if the pattern exists in the string
     *
     * Note: It was reported that not only space characters, but all in
     * the following pattern can be parsed as separators between a tag name
     * and its attributes: [\d\s"\'`;,\/\=\(\x00\x0B\x09\x0C]
     * ... however, remove_invisible_characters() above already strips the
     * hex-encoded ones, so we'll skip them below.
     *
     * The loop repeats until a full pass leaves the string unchanged.
     */
    do {
        $original = $string;

        if (preg_match('/<a/i', $string)) {
            $string = preg_replace_callback(
                '#<a[^a-z0-9>]+([^>]*?)(?:>|$)#si',
                [self::class, 'jsLinkRemoval'],
                $string
            );
        }

        if (preg_match('/<img/i', $string)) {
            $string = preg_replace_callback(
                '#<img[^a-z0-9]+([^>]*?)(?:\s?/?>|$)#si',
                [self::class, 'jsImgRemoval'],
                $string
            );
        }

        if (preg_match('/script|xss/i', $string)) {
            $string = preg_replace('#</*(?:script|xss).*?>#si', '[removed]', $string);
        }
    } while ($original !== $string);
    unset($original);

    /*
     * Sanitize naughty HTML elements
     *
     * If a tag containing any of the words in the list
     * below is found, the tag gets converted to entities.
     *
     * So this: <blink>
     * Becomes: &lt;blink&gt;
     */
    $pattern = '#'
        . '<((?<slash>/*\s*)(?<tagName>[a-z0-9]+)(?=[^a-z0-9]|$)'
        // tag start and name, followed by a non-tag character
        . '[^\s\042\047a-z0-9>/=]*'
        // a valid attribute character immediately after the tag would count as a separator
        // optional attributes
        . '(?<attributes>(?:[\s\042\047/=]*'
        // non-attribute characters, excluding > (tag close) for obvious reasons
        . '[^\s\042\047>/=]+'
        // attribute characters
        // optional attribute-value
        . '(?:\s*='
        // attribute-value separator
        . '(?:[^\s\042\047=><`]+|\s*\042[^\042]*\042|\s*\047[^\047]*\047|\s*(?U:[^\s\042\047=><`]*))'
        // single, double or non-quoted value
        . ')?'
        // end optional attribute-value group
        . ')*)'
        // end optional attributes group
        . '[^>]*)(?<closeTag>\>)?#isS';

    // Note: It would be nice to optimize this for speed, BUT
    // only matching the naughty elements here results in
    // false positives and in turn - vulnerabilities!
    do {
        $oldString = $string;
        $string = preg_replace_callback($pattern, [self::class, 'sanitizeNaughtyHTML'], $string);
    } while ($oldString !== $string);

    unset($oldString);

    /*
     * Sanitize naughty scripting elements
     *
     * Similar to above, only instead of looking for
     * tags it looks for PHP and JavaScript commands
     * that are disallowed. Rather than removing the
     * code, it simply converts the parenthesis to entities
     * rendering the code un-executable.
     *
     * For example: eval('some code')
     * Becomes:     eval&#40;'some code'&#41;
     */
    $string = preg_replace(
        '#(alert|prompt|confirm|cmd|passthru|eval|exec|expression|system|fopen|fsockopen|file|file_get_contents|readfile|unlink)(\s*)\((.*?)\)#si',
        '\\1\\2&#40;\\3&#41;',
        $string
    );

    // Final clean up
    // This adds a bit of extra precaution in case
    // something got through the above filters
    $string = self::doNeverAllowed($string);

    /*
     * Images are Handled in a Special Way
     * - Essentially, we want to know that after all of the character
     * conversion is done whether any unwanted, likely XSS, code was found.
     * If not, we return TRUE, as the image is clean.
     * However, if the string post-conversion does not match the
     * string post-removal of XSS, then it fails, as there was unwanted XSS
     * code found and removed/changed during processing.
     */
    if ($isImage === true) {
        return ($string === $convertedString);
    }

    return $string;
}
||
| 259 | |||
/**
 * Do Never Allowed
 *
 * Applies the configured blacklist: first the literal
 * 'never_allowed_strings' map (key => replacement), then every
 * 'never_allowed_regex' pattern, whose matches become "[removed]".
 *
 * @used-by XSS::clean()
 *
 * @param string $string Text to filter
 *
 * @return string Filtered text
 */
protected static function doNeverAllowed($string)
{
    // Literal replacements: the map's keys are searched for, its values substituted
    $replacements = self::getConfig('never_allowed_strings');
    $string = str_replace(array_keys($replacements), $replacements, $string);

    // Pattern replacements, applied one after another (case-insensitive, dot matches newlines)
    foreach (self::getConfig('never_allowed_regex') as $expression) {
        $string = preg_replace('#' . $expression . '#is', '[removed]', $string);
    }

    return $string;
}
||
| 283 | |||
| 284 | // -------------------------------------------------------------------------------------- |
||
| 285 | |||
/**
 * Get Config
 *
 * Returns one entry of the XSS configuration array. The configuration
 * file is loaded on first use and kept in a function-level static.
 *
 * Declared static because every caller in this class invokes it as
 * self::getConfig() from a static method; calling a non-static method
 * statically throws an Error on PHP 8.
 *
 * @param string $index Configuration key to read
 *
 * @return mixed Value stored under $index
 */
protected static function getConfig($index)
{
    static $config;

    if (empty($config)) {
        // NOTE(review): a path starting with "../" is resolved against the
        // current working directory, not this file - confirm whether this
        // should be anchored with __DIR__.
        $config = require('../Config/Xss.php');
    }

    return $config[ $index ];
}
||
| 296 | |||
| 297 | // -------------------------------------------------------------------- |
||
| 298 | |||
/**
 * Compact Exploded Words
 *
 * preg_replace_callback() handler that squeezes whitespace out of a
 * matched keyword, e.g. 'j a v a s c r i p t' becomes 'javascript'.
 *
 * @used-by XSS::clean()
 *
 * @param array $matches [1] the exploded word, [2] the character following it
 *
 * @return string Compacted word with the trailing character re-attached
 */
protected static function compactExplodedWords($matches)
{
    $explodedWord = $matches[ 1 ];
    $trailingChar = $matches[ 2 ];

    // Drop every run of whitespace inside the word itself
    $compacted = preg_replace('/\s+/s', '', $explodedWord);

    return $compacted . $trailingChar;
}
||
| 315 | |||
| 316 | // -------------------------------------------------------------------- |
||
| 317 | |||
/**
 * Sanitize Naughty HTML
 *
 * Callback method for clean() to neutralise naughty HTML elements.
 * Unclosed tags and tags listed under the 'naughty_tags' config key are
 * converted to entities; for any other tag, attributes matching the
 * 'evil_attributes' config key (or having an empty value) are replaced
 * with "xss=removed".
 *
 * @used-by XSS::clean()
 *
 * @param array $matches Match data with the named groups
 *                       'slash', 'tagName', 'attributes' and 'closeTag'
 *
 * @return string Replacement text for the matched tag
 */
protected static function sanitizeNaughtyHTML($matches)
{
    // First, escape unclosed tags. The opening bracket must be emitted as
    // an entity; re-emitting a literal "<" would not escape anything.
    if (empty($matches[ 'closeTag' ])) {
        return '&lt;' . $matches[ 1 ];
    }

    // Is the element that we caught naughty? If so, escape it
    if (in_array(strtolower($matches[ 'tagName' ]), self::getConfig('naughty_tags'), true)) {
        return '&lt;' . $matches[ 1 ] . '&gt;';
    }

    // For other tags, see if their attributes are "evil" and strip those
    if (isset($matches[ 'attributes' ])) {
        // We'll store the already filtered attributes here
        $attributes = [];

        // Attribute-catching pattern
        $attributesPattern = '#'
            . '(?<name>[^\s\042\047>/=]+)'
            // attribute characters
            // attribute-value separator followed by a single, double or non-quoted value
            . '(?:\s*=(?<value>[^\s\042\047=><`]+|\s*\042[^\042]*\042|\s*\047[^\047]*\047|\s*(?U:[^\s\042\047=><`]*)))'
            . '#i';

        // Blacklist pattern for evil attribute names
        $is_evil_pattern = '#^(' . implode('|', self::getConfig('evil_attributes')) . ')$#i';

        // Each iteration filters a single attribute
        do {
            // Strip any non-alpha characters that may precede an attribute.
            // Browsers often parse these incorrectly and that has been a
            // source of numerous XSS issues we've had.
            $matches[ 'attributes' ] = preg_replace('#^[^a-z]+#i', '', $matches[ 'attributes' ]);

            if ( ! preg_match($attributesPattern, $matches[ 'attributes' ], $attribute, PREG_OFFSET_CAPTURE)) {
                // No (valid) attribute found? Discard everything else inside the tag
                break;
            }

            if (
                // Is it indeed an "evil" attribute?
                preg_match($is_evil_pattern, $attribute[ 'name' ][ 0 ])
                // Or does it have an equals sign, but no value and not quoted? Strip that too!
                || (trim($attribute[ 'value' ][ 0 ]) === '')
            ) {
                $attributes[] = 'xss=removed';
            } else {
                $attributes[] = $attribute[ 0 ][ 0 ];
            }

            // Continue scanning right after the attribute that was just handled
            $matches[ 'attributes' ] = substr(
                $matches[ 'attributes' ],
                $attribute[ 0 ][ 1 ] + strlen($attribute[ 0 ][ 0 ])
            );
        } while ($matches[ 'attributes' ] !== '');

        $attributes = empty($attributes)
            ? ''
            : ' ' . implode(' ', $attributes);

        return '<' . $matches[ 'slash' ] . $matches[ 'tagName' ] . $attributes . '>';
    }

    return $matches[ 0 ];
}
||
| 391 | |||
| 392 | // -------------------------------------------------------------------- |
||
| 393 | |||
/**
 * JS Link Removal
 *
 * preg_replace_callback() handler for clean() that scrubs the attribute
 * section of an <a> tag: the attributes are normalised through
 * filterAttributes() and any href leading up to a scripting marker
 * (javascript:, data:, alert(, document., ...) is cut out.
 *
 * Working on one tag at a time limits the PCRE backtracks, making it
 * more performance friendly and preventing PREG_BACKTRACK_LIMIT_ERROR
 * from being triggered in PHP 5.2+ on link-heavy strings.
 *
 * @used-by XSS::clean()
 *
 * @param array $match [0] the whole tag, [1] its attribute text
 *
 * @return string The tag with its attribute text replaced by the scrubbed version
 */
protected static function jsLinkRemoval($match)
{
    $attributeText = $match[ 1 ];

    // Drop stray angle brackets, then keep only well-formed quoted attributes
    $normalised = self::filterAttributes(str_replace(['<', '>'], '', $attributeText));

    // Remove href values up to and including the first scripting marker
    $scrubbed = preg_replace(
        '#href=.*?(?:(?:alert|prompt|confirm)(?:\(|&\#40;)|javascript:|livescript:|mocha:|charset=|window\.|document\.|\.cookie|<script|<xss|data\s*:)#si',
        '',
        $normalised
    );

    return str_replace($attributeText, $scrubbed, $match[ 0 ]);
}
||
| 421 | |||
| 422 | // -------------------------------------------------------------------- |
||
| 423 | |||
/**
 * Filter Attributes
 *
 * Keeps only the quoted name="value" / name='value' pairs found in the
 * given attribute text and strips C-style comments from each of them.
 * Anything that is not part of such a pair is discarded.
 *
 * @used-by Security::jsImgRemoval()
 * @used-by Security::jsLinkRemoval()
 *
 * @param string $str Raw attribute text of a tag
 *
 * @return string Concatenation of the filtered attribute pairs ('' if none matched)
 */
protected static function filterAttributes($str)
{
    // Nothing that looks like a quoted attribute? Then nothing survives.
    if ( ! preg_match_all('#\s*[a-z\-]+\s*=\s*(\042|\047)([^\\1]*?)\\1#is', $str, $matches)) {
        return '';
    }

    $filtered = [];

    foreach ($matches[ 0 ] as $attribute) {
        // Remove /* ... */ comments hidden inside the attribute
        $filtered[] = preg_replace('#/\*.*?\*/#s', '', $attribute);
    }

    return implode('', $filtered);
}
||
| 447 | |||
| 448 | // -------------------------------------------------------------------- |
||
| 449 | |||
/**
 * JS Image Removal
 *
 * preg_replace_callback() handler for clean() that scrubs the attribute
 * section of an <img> tag: the attributes are normalised through
 * filterAttributes() and any src leading up to a scripting marker
 * (javascript:, base64, alert(, document., ...) is cut out.
 *
 * Working on one tag at a time limits the PCRE backtracks, making it
 * more performance friendly and preventing PREG_BACKTRACK_LIMIT_ERROR
 * from being triggered in PHP 5.2+ on image tag heavy strings.
 *
 * @used-by XSS::clean()
 *
 * @param array $match [0] the whole tag, [1] its attribute text
 *
 * @return string The tag with its attribute text replaced by the scrubbed version
 */
protected static function jsImgRemoval($match)
{
    $attributeText = $match[ 1 ];

    // Drop stray angle brackets, then keep only well-formed quoted attributes
    $normalised = self::filterAttributes(str_replace(['<', '>'], '', $attributeText));

    // Remove src values up to and including the first scripting marker
    $scrubbed = preg_replace(
        '#src=.*?(?:(?:alert|prompt|confirm)(?:\(|&\#40;)|javascript:|livescript:|mocha:|charset=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)#si',
        '',
        $normalised
    );

    return str_replace($attributeText, $scrubbed, $match[ 0 ]);
}
||
| 477 | |||
| 478 | // -------------------------------------------------------------------- |
||
| 479 | |||
/**
 * Attribute Conversion
 *
 * preg_replace_callback() handler for clean(). Neutralises characters
 * inside a matched attribute that could break out of it: angle brackets
 * become HTML entities and backslashes are doubled.
 *
 * @used-by XSS::clean()
 *
 * @param array $match [0] the full attribute match (name, quotes and value)
 *
 * @return string The attribute text with <, > and \ neutralised
 */
protected static function convertAttribute($match)
{
    // The brackets must map to entities; replacing them with themselves
    // would leave the attribute text untouched.
    return str_replace(['>', '<', '\\'], ['&gt;', '&lt;', '\\\\'], $match[ 0 ]);
}
||
| 493 | |||
| 494 | // ------------------------------------------------------------------------ |
||
| 495 | |||
| 496 | /** |
||
| 497 | * HTML Entity Decode Callback |
||
| 498 | * |
||
| 499 | * @used-by XSS::clean() |
||
| 500 | * |
||
| 501 | * @param array $match |
||
| 502 | * |
||
| 503 | * @return string |
||
| 504 | */ |
||
| 505 | protected static function decodeEntity($match) |
||
| 521 | ); |
||
| 522 | } |
||
| 523 | |||
| 524 | // -------------------------------------------------------------------- |
||
| 525 | |||
| 526 | /** |
||
| 527 | * HTML Entities Decode |
||
| 528 | * |
||
| 529 | * A replacement for html_entity_decode() |
||
| 530 | * |
||
| 531 | * The reason we are not using html_entity_decode() by itself is because |
||
| 532 | * while it is not technically correct to leave out the semicolon |
||
| 533 | * at the end of an entity most browsers will still interpret the entity |
||
| 534 | * correctly. html_entity_decode() does not convert entities without |
||
| 535 | * semicolons, so we are left with our own little solution here. Bummer. |
||
| 536 | * |
||
| 537 | * @link http://php.net/html-entity-decode |
||
| 538 | * |
||
| 539 | * @param string $string Input |
||
| 540 | * @param string $charset Character set |
||
| 541 | * |
||
| 542 | * @return string |
||
| 543 | */ |
||
| 544 | protected static function entityDecode($string, $charset = null) |
||
| 592 | } |
||
| 593 | |||
| 594 | } |