| Conditions | 18 |
| Paths | 131 |
| Total Lines | 239 |
| Code Lines | 96 |
| Lines | 0 |
| Ratio | 0 % |
| Changes | 1 | ||
| Bugs | 0 | Features | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
- Extract Method
If many parameters/temporary variables are present:
- Replace Temp with Query
- Introduce Parameter Object
- Preserve Whole Object
| 1 | <?php |
||
/**
 * Sanitizes a value against common XSS vectors.
 *
 * Arrays are cleaned element by element (recursively). Empty, boolean,
 * falsy and numeric values are returned untouched. Any other value is
 * run through a sequence of filters: invisible-character removal, URL
 * decoding, entity decoding inside tags, forbidden-string removal,
 * PHP-tag neutralisation, exploded-word compaction, JavaScript removal
 * from links/images, naughty-tag sanitising and finally neutralising
 * calls to disallowed functions.
 *
 * @param mixed $str     Value to clean (string or array of values).
 * @param bool  $isImage When true, the string is treated as image data:
 *                       only long PHP opening tags are neutralised and
 *                       the method returns a boolean instead of the
 *                       cleaned string.
 *
 * @return mixed The cleaned array or string; the input itself when it
 *               is empty/boolean/numeric; or, when $isImage is true,
 *               TRUE if the filters left the converted string unchanged
 *               and FALSE otherwise.
 */
public function clean(mixed $str, bool $isImage = false): mixed
{
    if (is_array($str)) {
        // Iterate by value: a by-reference loop variable is not needed
        // here (we assign through $str[$key]) and would be left dangling.
        // NOTE: $isImage is intentionally not forwarded, matching the
        // original behaviour for array input.
        foreach ($str as $key => $value) {
            $str[$key] = $this->clean($value);
        }

        return $str;
    }

    // Nothing to sanitise in empty, boolean, falsy or numeric values.
    if ($str === '' || $str === null || is_bool($str) || ! $str || is_numeric($str)) {
        return $str;
    }

    // Remove Invisible Characters
    $str = $this->removeInvisibleCharacters($str);

    // URL Decode
    // Just in case stuff like this is submitted:
    // <a href="http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D">Google</a>
    // Note: Use rawurldecode() so it does not remove plus signs
    if (str_contains($str, '%')) {
        // Repeat until stable so multiply-encoded payloads are fully decoded.
        do {
            $oldStr = $str;
            $rawStr = rawurldecode($str);
            $str    = (string) preg_replace_callback(
                '#%(?:\s*[0-9a-f]){2,}#i',
                [$this, 'urlDecodeSpaces'],
                $rawStr
            );
        } while ($oldStr !== $str);
        unset($oldStr);
    }

    /*
     * Convert character entities to ASCII
     *
     * This permits our tests below to work reliably.
     * We only convert entities that are within tags since
     * these are the ones that will pose security problems.
     */
    $str = (string) preg_replace_callback(
        "/[^a-z0-9>]+[a-z0-9]+=([\'\"]).*?\\1/si",
        [$this, 'convertAttribute'],
        $str
    );

    $str = (string) preg_replace_callback(
        '/<\w+.*/si',
        [$this, 'decodeEntity'],
        $str
    );

    // Remove Invisible Characters Again!
    $str = $this->removeInvisibleCharacters($str);

    /*
     * Convert all tabs to spaces
     *
     * This prevents strings like this: ja	vascript (embedded tab)
     * NOTE: we deal with spaces between characters later.
     * NOTE: preg_replace was found to be amazingly slow here on
     * large blocks of data, so we use str_replace.
     */
    $str = str_replace("\t", ' ', $str);

    // Capture converted string for later comparison
    $convertedString = $str;

    // Remove Strings that are never allowed
    $str = $this->removeForbiddenStrings($str);

    /*
     * Makes PHP tags safe
     * Note: XML tags are inadvertently replaced too:
     * <?xml
     *
     * But it doesn't seem to pose a problem.
     */
    if ($isImage) {
        // Images have a tendency to have the PHP short opening and
        // closing tags every so often so we skip those and only
        // do the long opening tags.
        $str = (string) preg_replace(
            '/<\?(php)/i',
            '&lt;?\\1',
            $str
        );
    } else {
        // The opening angle bracket / closing angle bracket are turned
        // into entities so the tags can no longer be interpreted.
        $str = str_replace(
            ['<?', '?' . '>'],
            ['&lt;?', '?&gt;'],
            $str
        );
    }

    /*
     * Compact any exploded words
     *
     * This corrects words like: j a v a s c r i p t
     * These words are compacted back to their correct state.
     */
    $words = [
        'javascript', 'expression', 'vbscript', 'jscript', 'wscript',
        'vbs', 'script', 'base64', 'applet', 'alert', 'document',
        'write', 'cookie', 'window', 'confirm', 'prompt', 'eval',
    ];

    foreach ($words as $word) {
        // Allow optional whitespace between every pair of letters.
        $spaced = implode('\s*', str_split($word));

        // We only want to do this when it is followed by a non-word character
        // That way valid stuff like "dealer to" does not become "dealerto"
        $str = (string) preg_replace_callback(
            '#(' . $spaced . ')(\W)#is',
            [$this, 'compactExplodedWords'],
            $str
        );
    }

    /*
     * Remove disallowed Javascript in links or img tags
     * We used to do some version comparisons and use of stripos(),
     * but it is dog slow compared to these simplified non-capturing
     * preg_match(), especially if the pattern exists in the string
     *
     * Note: It was reported that not only space characters, but all in
     * the following pattern can be parsed as separators between a tag name
     * and its attributes: [\d\s"\'`;,\/\=\(\x00\x0B\x09\x0C]
     * ... however, remove invisible characters above already strips the
     * hex-encoded ones, so we'll skip them below.
     */
    do {
        $original = $str;

        if (preg_match('/<a/i', $str)) {
            $str = (string) preg_replace_callback(
                '#<a(?:rea)?[^a-z0-9>]+([^>]*?)(?:>|$)#si',
                [$this, 'removeJsLink'],
                $str
            );
        }

        if (preg_match('/<img/i', $str)) {
            $str = (string) preg_replace_callback(
                '#<img[^a-z0-9]+([^>]*?)(?:\s?/?>|$)#si',
                [$this, 'removeJsImage'],
                $str
            );
        }

        if (preg_match('/script|xss/i', $str)) {
            $str = (string) preg_replace(
                '#</*(?:script|xss).*?>#si',
                '[removed]',
                $str
            );
        }
    } while ($original !== $str);
    unset($original);

    /*
     * Sanitize naughty HTML elements
     *
     * If a tag containing any of the words in the list
     * below is found, the tag gets converted to entities.
     *
     * So this: <blink>
     * Becomes: &lt;blink&gt;
     */

    $pattern = '#'
        . '<((?<slash>/*\s*)((?<tagName>[a-z0-9]+)(?=[^a-z0-9]|$)|.+)' // tag
        // start and name, followed by a non-tag character
        . '[^\s\042\047a-z0-9>/=]*' // a valid attribute character
        // immediately after the tag would count as a separator
        // optional attributes
        . '(?<attributes>(?:[\s\042\047/=]*' // non-attribute characters,
        // excluding > (tag close) for obvious reasons
        . '[^\s\042\047>/=]+' // attribute characters
        // optional attribute-value
        . '(?:\s*=' // attribute-value separator
        . '(?:[^\s\042\047=><`]+|\s*\042[^\042]*\042|\s*\047[^\047]'
        . '*\047|\s*(?U:[^\s\042\047=><`]*))' // single, double or non-quoted value
        . ')?' // end optional attribute-value group
        . ')*)' // end optional attributes group
        . '[^>]*)(?<closeTag>\>)?#isS';

    // Note: It would be nice to optimize this for speed, BUT
    // only matching the naughty elements here results in
    // false positives and in turn - vulnerabilities!
    do {
        $oldStr = $str;
        $str    = (string) preg_replace_callback(
            $pattern,
            [$this, 'sanitizeNaughtyHtml'],
            $str
        );
    } while ($oldStr !== $str);
    unset($oldStr);

    /*
     * Sanitize naughty scripting elements
     *
     * Similar to above, only instead of looking for
     * tags it looks for PHP and JavaScript commands
     * that are disallowed. Rather than removing the
     * code, it simply converts the parenthesis to entities
     * rendering the code un-executable.
     *
     * For example: eval('some code')
     * Becomes:     eval&#40;'some code'&#41;
     */
    $str = (string) preg_replace(
        '#(alert|prompt|confirm|cmd|passthru|eval|exec|expression|system|'
        . 'fopen|fsockopen|file|file_get_contents|readfile|unlink)(\s*)\((.*?)\)#si',
        '\\1\\2&#40;\\3&#41;',
        $str
    );

    // Final clean up
    // This adds a bit of extra precaution in case
    // something got through the above filters
    $str = $this->removeForbiddenStrings($str);

    /*
     * Images are Handled in a Special Way
     * - Essentially, we want to know that after all of the character
     * conversion is done whether any unwanted, likely XSS, code was found.
     * If not, we return TRUE, as the image is clean.
     * However, if the string post-conversion does not match the
     * string post-removal of XSS, then it fails, as there was unwanted XSS
     * code found and removed/changed during processing.
     */
    if ($isImage) {
        return $str === $convertedString;
    }

    return $str;
}
||
| 792 |