Conditions | 12 |
Paths | 66 |
Total Lines | 227 |
Code Lines | 84 |
Lines | 0 |
Ratio | 0 % |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include: Extract Method.
If many parameters/temporary variables are present: Replace Temp with Query, Introduce Parameter Object, Preserve Whole Object.
1 | <?php |
||
/**
 * Runs the XSS filter chain over a string, or over every element of an array.
 *
 * The input is URL-decoded, attribute/tag entities are normalised through the
 * class callbacks, never-allowed strings are removed, PHP tags and dangerous
 * call parentheses are converted to HTML entities, and disallowed
 * link/img/script markup is rewritten by the class callbacks.
 *
 * @param string|array $string  Value to clean. Arrays are cleaned recursively,
 *                              element by element, with their keys preserved.
 * @param bool         $isImage When exactly true, the method does not return
 *                              the cleaned text; it returns whether the filter
 *                              chain left the decoded input unchanged.
 *
 * @return string|array|bool The cleaned string or array; or, when $isImage is
 *                           true, true if nothing had to be altered and false
 *                           otherwise.
 */
public static function clean($string, $isImage = false)
{
    // Is the string an array?
    if (is_array($string)) {
        // each() was removed in PHP 8.0, so iterate over the keys instead.
        // Note: $isImage is not forwarded to the recursive call, which keeps
        // the behaviour of the previous implementation.
        foreach (array_keys($string) as $key) {
            $string[$key] = self::clean($string[$key]);
        }

        return $string;
    }

    // Remove Invisible Characters
    $string = remove_invisible_characters($string);

    /*
     * URL Decode
     *
     * Just in case stuff like this is submitted:
     *
     * <a href="http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D">Google</a>
     *
     * Note: Use rawurldecode() so it does not remove plus signs.
     * The loop repeats so that multiply-encoded input is fully decoded.
     */
    do {
        $string = rawurldecode($string);
    } while (preg_match('/%[0-9a-f]{2,}/i', $string));

    /*
     * Convert character entities to ASCII
     *
     * This permits our tests below to work reliably.
     * We only convert entities that are within tags since
     * these are the ones that will pose security problems.
     */
    $string = preg_replace_callback(
        "/[^a-z0-9>]+[a-z0-9]+=([\'\"]).*?\\1/si",
        [self::class, 'convertAttribute'],
        $string
    );

    $string = preg_replace_callback('/<\w+.*/si', [self::class, 'decodeEntity'], $string);

    // Remove Invisible Characters Again!
    $string = remove_invisible_characters($string);

    /*
     * Convert all tabs to spaces
     *
     * This prevents strings like this: ja	vascript
     * NOTE: we deal with spaces between characters later.
     * NOTE: preg_replace was found to be amazingly slow here on
     * large blocks of data, so we use str_replace.
     */
    $string = str_replace("\t", ' ', $string);

    // Capture converted string for later comparison
    $convertedString = $string;

    // Remove Strings that are never allowed
    $string = self::doNeverAllowed($string);

    /*
     * Makes PHP tags safe
     *
     * Note: XML tags are inadvertently replaced too:
     *
     * <?xml
     *
     * But it doesn't seem to pose a problem.
     */
    if ($isImage === true) {
        // Images have a tendency to have the PHP short opening and
        // closing tags every so often so we skip those and only
        // do the long opening tags.
        $string = preg_replace('/<\?(php)/i', '&lt;?\\1', $string);
    } else {
        // The replacements must be the entity-encoded forms; replacing the
        // tags with themselves would leave them executable.
        $string = str_replace(['<?', '?' . '>'], ['&lt;?', '?&gt;'], $string);
    }

    /*
     * Compact any exploded words
     *
     * This corrects words like: j a v a s c r i p t
     * These words are compacted back to their correct state.
     */
    $words = [
        'javascript',
        'expression',
        'vbscript',
        'jscript',
        'wscript',
        'vbs',
        'script',
        'base64',
        'applet',
        'alert',
        'document',
        'write',
        'cookie',
        'window',
        'confirm',
        'prompt',
        'eval',
    ];

    foreach ($words as $word) {
        // Allow optional whitespace between every letter of the word.
        $spacedWord = implode('\s*', str_split($word));

        // We only want to do this when it is followed by a non-word character
        // That way valid stuff like "dealer to" does not become "dealerto"
        $string = preg_replace_callback(
            '#(' . $spacedWord . ')(\W)#is',
            [self::class, 'compactExplodedWords'],
            $string
        );
    }

    /*
     * Remove disallowed Javascript in links or img tags
     * We used to do some version comparisons and use of stripos(),
     * but it is dog slow compared to these simplified non-capturing
     * preg_match(), especially if the pattern exists in the string
     *
     * Note: It was reported that not only space characters, but all in
     * the following pattern can be parsed as separators between a tag name
     * and its attributes: [\d\s"\'`;,\/\=\(\x00\x0B\x09\x0C]
     * ... however, remove_invisible_characters() above already strips the
     * hex-encoded ones, so we'll skip them below.
     *
     * The loop repeats until a full pass makes no further change, so that
     * markup exposed by one removal is caught by the next pass.
     */
    do {
        $original = $string;

        if (preg_match('/<a/i', $string)) {
            $string = preg_replace_callback(
                '#<a[^a-z0-9>]+([^>]*?)(?:>|$)#si',
                [self::class, 'jsLinkRemoval'],
                $string
            );
        }

        if (preg_match('/<img/i', $string)) {
            $string = preg_replace_callback(
                '#<img[^a-z0-9]+([^>]*?)(?:\s?/?>|$)#si',
                [self::class, 'jsImgRemoval'],
                $string
            );
        }

        if (preg_match('/script|xss/i', $string)) {
            $string = preg_replace('#</*(?:script|xss).*?>#si', '[removed]', $string);
        }
    } while ($original !== $string);
    unset($original);

    /*
     * Sanitize naughty HTML elements
     *
     * If a tag containing any of the words in the list
     * below is found, the tag gets converted to entities.
     *
     * So this: <blink>
     * Becomes: &lt;blink&gt;
     */
    $pattern = '#'
        . '<((?<slash>/*\s*)(?<tagName>[a-z0-9]+)(?=[^a-z0-9]|$)'
        // tag start and name, followed by a non-tag character
        . '[^\s\042\047a-z0-9>/=]*'
        // a valid attribute character immediately after the tag would count as a separator
        // optional attributes
        . '(?<attributes>(?:[\s\042\047/=]*'
        // non-attribute characters, excluding > (tag close) for obvious reasons
        . '[^\s\042\047>/=]+'
        // attribute characters
        // optional attribute-value
        . '(?:\s*='
        // attribute-value separator
        . '(?:[^\s\042\047=><`]+|\s*\042[^\042]*\042|\s*\047[^\047]*\047|\s*(?U:[^\s\042\047=><`]*))'
        // single, double or non-quoted value
        . ')?'
        // end optional attribute-value group
        . ')*)'
        // end optional attributes group
        . '[^>]*)(?<closeTag>\>)?#isS';

    // Note: It would be nice to optimize this for speed, BUT
    // only matching the naughty elements here results in
    // false positives and in turn - vulnerabilities!
    do {
        $oldString = $string;
        $string = preg_replace_callback($pattern, [self::class, 'sanitizeNaughtyHTML'], $string);
    } while ($oldString !== $string);

    unset($oldString);

    /*
     * Sanitize naughty scripting elements
     *
     * Similar to above, only instead of looking for
     * tags it looks for PHP and JavaScript commands
     * that are disallowed. Rather than removing the
     * code, it simply converts the parenthesis to entities
     * rendering the code un-executable.
     *
     * For example: eval('some code')
     * Becomes:     eval&#40;'some code'&#41;
     */
    $string = preg_replace(
        '#(alert|prompt|confirm|cmd|passthru|eval|exec|expression|system|fopen|fsockopen|file|file_get_contents|readfile|unlink)(\s*)\((.*?)\)#si',
        '\\1\\2&#40;\\3&#41;',
        $string
    );

    // Final clean up
    // This adds a bit of extra precaution in case
    // something got through the above filters
    $string = self::doNeverAllowed($string);

    /*
     * Images are Handled in a Special Way
     * - Essentially, we want to know that after all of the character
     *   conversion is done whether any unwanted, likely XSS, code was found.
     *   If not, we return TRUE, as the image is clean.
     *   However, if the string post-conversion does not match the
     *   string post-removal of XSS, then it fails, as there was unwanted XSS
     *   code found and removed/changed during processing.
     */
    if ($isImage === true) {
        return ($string === $convertedString);
    }

    return $string;
}
||
594 | } |