Filter::isUtf8()   A
last analyzed

Complexity

Conditions 2
Paths 2

Size

Total Lines 8
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 8
rs 9.4285
c 0
b 0
f 0
cc 2
eloc 4
nc 2
nop 1
1
<?php
2
3
/**
4
 * @package XSS filter
5
 * @author Iurii Makukh <[email protected]>
6
 * @copyright Copyright (c) 2018, Iurii Makukh
7
 * @license https://www.gnu.org/licenses/gpl.html GNU/GPLv3
8
 */
9
10
namespace gplcart\modules\xss\helpers;
11
12
/**
 * XSS filter. Based on Drupal's XSS filter
 *
 * Removes every HTML element whose name is not in the allowed tag list,
 * drops "style" and "on*" attributes from the elements that remain, and
 * strips URL schemes that are not in the allowed protocol list from the
 * remaining attribute values.
 */
class Filter
{

    /**
     * An array of allowed protocols.
     * Schemes are lower-cased before the (strict) lookup, so entries are
     * expected to be lower-case strings such as "http".
     * @var array
     */
    protected $protocols;

    /**
     * An array of allowed tags.
     * Element names are lower-cased before the (strict) lookup. HTML comments
     * are only kept when the pseudo tag "!--" is listed.
     * @var array
     */
    protected $tags;

    /**
     * Constructor. Starts with nothing allowed: no tags and no protocols
     */
    public function __construct()
    {
        $this->tags = array();
        $this->protocols = array();
    }

    /**
     * Set allowed protocols
     * @param array $protocols
     * @return $this
     */
    public function setProtocols(array $protocols)
    {
        $this->protocols = $protocols;
        return $this;
    }

    /**
     * Set allowed tags
     * @param array $tags
     * @return $this
     */
    public function setTags(array $tags)
    {
        $this->tags = $tags;
        return $this;
    }

    /**
     * Filter a string
     * @param string $string
     * @return string The filtered markup, or an empty string when the input
     * is not valid UTF-8 or a regular expression step fails
     */
    public function filter($string)
    {
        $string = (string) $string;

        // Only operate on valid UTF-8 strings. This is necessary to prevent cross
        // site scripting issues on Internet Explorer 6.
        if (!$this->isUtf8($string)) {
            return '';
        }

        // Remove NULL characters (ignored by some browsers).
        $string = str_replace(chr(0), '', $string);

        // Remove Netscape 4 JS entities.
        // preg_replace() yields null on a PCRE error; the cast turns that into
        // an empty string so the filter fails closed.
        $string = (string) preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $string);

        // Defuse all HTML entities.
        $string = str_replace('&', '&amp;', $string);

        // Change back only well-formed entities in our whitelist.
        // The patterns are applied one after another, in the order listed.
        $patterns = array(
            // Decimal numeric entities.
            '/&amp;#([0-9]+;)/',
            // Hexadecimal numeric entities.
            '/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/',
            // Named entities.
            '/&amp;([A-Za-z][A-Za-z0-9]*;)/',
        );
        $replacements = array('&#\1', '&#x\1', '&\1');
        $string = (string) preg_replace($patterns, $replacements, $string);

        // Hand every "<...>" fragment and every stray angle bracket to build().
        return (string) preg_replace_callback('%
            (
            <(?=[^a-zA-Z!/])  # a lone <
            |                 # or
            <!--.*?-->        # a comment
            |                 # or
            <[^>]*(>|$)       # a string that starts with a <, up until the > or the end of the string
            |                 # or
            >                 # just a >
            )%x', array($this, 'build'), $string);
    }

    /**
     * Whether a string is valid UTF-8
     * @param string $string
     * @return boolean
     */
    protected function isUtf8($string)
    {
        $string = (string) $string;

        // The pattern below needs at least one character, so an empty string
        // is accepted up front.
        if ($string === '') {
            return true;
        }

        // The "u" modifier makes PCRE treat the subject as UTF-8; anything
        // other than a result of 1 is treated as "not valid".
        return preg_match('/^./us', $string) === 1;
    }

    /**
     * Build a filtered tag
     * @param array $m Match array from filter(); index 1 holds the matched fragment
     * @return string The escaped bracket, the cleaned tag, or an empty string
     * when the element is malformed or not allowed
     */
    protected function build($m)
    {
        $string = $m[1];

        if (substr($string, 0, 1) !== '<') {
            // We matched a lone ">" character.
            return '&gt;';
        }

        if (strlen($string) === 1) {
            // We matched a lone "<" character.
            return '&lt;';
        }

        if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9\-]+)([^>]*)>?|(<!--.*?-->)$%', $string, $matches)) {
            // Seriously malformed.
            return '';
        }

        // Group 4 only exists when the comment alternative matched.
        $comment = isset($matches[4]) ? $matches[4] : '';
        $is_comment = ($comment !== '');
        $elem = $is_comment ? '!--' : $matches[2];

        if (!in_array(strtolower($elem), $this->tags, true)) {
            return ''; // Disallowed HTML element.
        }

        if ($is_comment) {
            return $comment;
        }

        if (trim($matches[1]) !== '') {
            // Closing tag: attributes, if any, are discarded.
            return "</$elem>";
        }

        // Is there a closing XHTML slash at the end of the attributes?
        $attrlist = (string) preg_replace('%(\s?)/\s*$%', '\1', $matches[3], -1, $count);
        $xhtml_slash = $count ? ' /' : '';

        // Clean up attributes.
        $attr2 = implode(' ', $this->explodeAttributes($attrlist));
        $attr2 = (string) preg_replace('/[<>]/', '', $attr2);
        $attr2 = ($attr2 !== '') ? ' ' . $attr2 : '';

        return "<$elem$attr2$xhtml_slash>";
    }

    /**
     * Returns an array of filtered attributes
     *
     * Walks the attribute list with a small state machine:
     * mode 0 expects an attribute name, mode 1 an equals sign or whitespace
     * (valueless attribute), mode 2 an attribute value. "style" and "on*"
     * attributes are parsed but never added to the result.
     *
     * @param string $attr Raw attribute list of a single tag
     * @return array Attribute strings such as 'href="..."' or 'selected'
     */
    protected function explodeAttributes($attr)
    {
        $attr = (string) $attr;
        $attrarr = array();
        $mode = 0;
        $attrname = '';
        $skip = false;

        // Accepted value forms, tried in this order, mapped to the quote
        // character used when the attribute is written back out.
        $value_patterns = array(
            '/^"([^"]*)"(\s+|$)/' => '"',
            "/^'([^']*)'(\s+|$)/" => "'",
            "%^([^\s\"']+)(\s+|$)%" => '"',
        );

        while ($attr !== '') {

            $working = 0;

            switch ($mode) {
                case 0:
                    // Attribute name, href for instance.
                    if (preg_match('/^([-a-zA-Z]+)/', $attr, $match)) {
                        $attrname = strtolower($match[1]);
                        $skip = ($attrname === 'style' || substr($attrname, 0, 2) === 'on');
                        $working = $mode = 1;
                        // Every pattern here is anchored at the start, so
                        // cutting off the matched prefix consumes it.
                        $attr = (string) substr($attr, strlen($match[0]));
                    }
                    break;
                case 1:
                    // Equals sign or valueless ("selected").
                    if (preg_match('/^\s*=\s*/', $attr, $match)) {
                        $working = 1;
                        $mode = 2;
                        $attr = (string) substr($attr, strlen($match[0]));
                        break;
                    }

                    if (preg_match('/^\s+/', $attr, $match)) {
                        $working = 1;
                        $mode = 0;

                        if (!$skip) {
                            $attrarr[] = $attrname;
                        }

                        $attr = (string) substr($attr, strlen($match[0]));
                    }
                    break;
                case 2:
                    // Attribute value, a URL after href= for instance.
                    foreach ($value_patterns as $pattern => $quote) {
                        if (!preg_match($pattern, $attr, $match)) {
                            continue;
                        }

                        $thisval = $this->badProtocol($match[1]);

                        if (!$skip) {
                            $attrarr[] = $attrname . '=' . $quote . $thisval . $quote;
                        }

                        $working = 1;
                        $mode = 0;
                        $attr = (string) substr($attr, strlen($match[0]));
                        break; // Leaves the foreach only.
                    }
                    break;
            }

            if ($working === 0) {
                // Not well formed; remove and try again.
                // NOTE(review): the pattern contains an empty alternative (the
                // trailing "|" after the single-quote branch). It is kept
                // byte-for-byte, and still applied with preg_replace(), because
                // changing either may alter how much malformed input is
                // discarded per pass.
                $attr = (string) preg_replace('/
                ^
                (
                "[^"]*("|$)     # - a string that starts with a double quote, up until the next double quote or the end of the string
                |               # or
                \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string
                |               # or
                \S              # - a non-whitespace character
                )*              # any number of the above three
                \s*             # any number of whitespaces
                /x', '', $attr);

                $mode = 0;
            }
        }

        // The attribute list ends with a valueless attribute like "selected".
        if ($mode === 1 && !$skip) {
            $attrarr[] = $attrname;
        }

        return $attrarr;
    }

    /**
     * Cleans a single attribute value: decodes HTML entities, strips
     * disallowed URL schemes and escapes the result again
     * @param string $string
     * @return string
     */
    protected function badProtocol($string)
    {
        $decoded = html_entity_decode($string, ENT_QUOTES, 'UTF-8');
        return htmlspecialchars($this->stripDangerousProtocols($decoded), ENT_QUOTES, 'UTF-8');
    }

    /**
     * Removes leading URL schemes that are not in the allowed protocol list
     * @param string $uri
     * @return string The URI with every leading disallowed "scheme:" prefix removed
     */
    protected function stripDangerousProtocols($uri)
    {
        // Iteratively remove any invalid protocol found.
        do {
            $before = $uri;
            $colonpos = strpos($uri, ':');

            if ($colonpos !== false && $colonpos > 0) {
                // We found a colon, possibly a protocol. Verify.
                $protocol = substr($uri, 0, $colonpos);

                // If a colon is preceded by a slash, question mark or hash, it cannot
                // possibly be part of the URL scheme. This must be a relative URL, which
                // inherits the (safe) protocol of the base document.
                if (preg_match('![/?#]!', $protocol)) {
                    break;
                }

                // Check if this is a disallowed protocol. Per RFC2616, section 3.2.3
                // (URI Comparison) scheme comparison must be case-insensitive.
                if (!in_array(strtolower($protocol), $this->protocols, true)) {
                    // Cast: substr() returns false instead of '' on PHP < 8
                    // when nothing follows the colon.
                    $uri = (string) substr($uri, $colonpos + 1);
                }
            }
        } while ($before !== $uri);

        return $uri;
    }

}
333