Passed
Push — develop ( 089cb5...597d05 )
by nguereza
05:42
created

InputClean::htmlEntityDecode()   B

Complexity

Conditions 8
Paths 9

Size

Total Lines 51
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 8
eloc 29
c 1
b 0
f 0
nc 9
nop 2
dl 0
loc 51
rs 8.2114

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * Platine Framework
5
 *
6
 * Platine Framework is a lightweight, high-performance, simple and elegant
7
 * PHP Web framework
8
 *
9
 * This content is released under the MIT License (MIT)
10
 *
11
 * Copyright (c) 2020 Platine Framework
12
 *
13
 * Permission is hereby granted, free of charge, to any person obtaining a copy
14
 * of this software and associated documentation files (the "Software"), to deal
15
 * in the Software without restriction, including without limitation the rights
16
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
 * copies of the Software, and to permit persons to whom the Software is
18
 * furnished to do so, subject to the following conditions:
19
 *
20
 * The above copyright notice and this permission notice shall be included in all
21
 * copies or substantial portions of the Software.
22
 *
23
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
 * SOFTWARE.
30
 */
31
32
/**
33
 *  @file InputClean.php
34
 *
35
 *  This class cleans (XSS filtering, sanitization, ...) the request data
36
 *
37
 *  @package    Platine\Framework\Http
38
 *  @author Platine Developers team
39
 *  @copyright  Copyright (c) 2020
40
 *  @license    http://opensource.org/licenses/MIT  MIT License
41
 *  @link   https://www.platine-php.com
42
 *  @version 1.0.0
43
 *  @filesource
44
 */
45
46
declare(strict_types=1);
47
48
namespace Platine\Framework\Http;
49
50
use Platine\Stdlib\Helper\Str;
51
52
/**
53
 * @class InputClean
54
 * @package Platine\Framework\Http
55
 */
56
class InputClean
57
{
58
    /**
     * Character sequences that are stripped from file names
     * by sanitizeFilename(). Plain and URL-encoded forms are listed.
     * @var array<string>
     */
    protected array $invalidFilenameChars = [
        '../',
        '<!--',
        '-->',
        '<',
        '>',
        '\'',
        '"',
        '&',
        '$',
        '#',
        '{',
        '}',
        '[',
        ']',
        '=',
        ';',
        '?',
        '%20',      // space
        '%22',      // "
        '%3c',      // <
        '%253c',    // < (double URL-encoded)
        '%3e',      // >
        '%0e',      // byte 0x0E (a control character)
        '%28',      // (
        '%29',      // )
        '%2528',    // ( (double URL-encoded)
        '%26',      // &
        '%24',      // $
        '%3f',      // ?
        '%3b',      // ;
        '%3d'       // =
    ];

    /**
     * Character set handed to the HTML entity functions
     * @var string
     */
    protected string $charset = 'UTF-8';

    /**
     * Lazily generated random token, see getXssHash().
     * It temporarily stands in for "&" in URL query strings
     * while entities are being decoded.
     * @var string
     */
    protected string $xssHash = '';

    /**
     * Literal strings that are never allowed, mapped to their replacement
     * @var array<string, string>
     */
    protected array $forbiddenStrings = [
        'document.cookie' => '[removed]',
        'document.write' => '[removed]',
        '.parentNode' => '[removed]',
        '.innerHTML' => '[removed]',
        '-moz-binding' => '[removed]',
        '<!--' => '&lt;!--',
        '-->' => '--&gt;',
        '<![CDATA[' => '&lt;![CDATA[',
        '<comment>' => '&lt;comment&gt;',
        '<%' => '&lt;&#37;'
    ];

    /**
     * Regular expression fragments (without delimiters) that are
     * never allowed; every match is replaced by "[removed]"
     * @var array<string>
     */
    protected array $forbiddenStringPatterns = [
        'javascript\s*:',
        '(document|(document\.)?window)\.(location|on\w*)',
        'expression\s*(\(|&\#40;)', // CSS and IE
        'vbscript\s*:', // IE
        'wscript\s*:', // IE
        'jscript\s*:', // IE
        'vbs\s*:', // IE
        'Redirect\s+30\d',
        "([\"'])?data\s*:[^\\1]*?base64[^\\1]*?,[^\\1]*?\\1?"
    ];
125
126
    /**
     * Build a new cleaner
     * @param string $charset the character set used when HTML
     * entities are decoded (defaults to UTF-8)
     */
    public function __construct(string $charset = 'UTF-8')
    {
        $this->charset = $charset;
    }
134
135
        /**
     * The main function to clean input (XSS filtering).
     *
     * Arrays are cleaned recursively, element by element. Empty values,
     * booleans and numeric values are returned untouched. Any other value
     * goes through the filtering pipeline below, each step being
     * implemented by a dedicated helper.
     *
     * @param mixed $str the value to clean
     * @param bool $isImage whether the data is the content of an image. In that
     * case the result is a boolean: true when the filters changed nothing
     * (the content looks safe), false otherwise.
     * @return mixed the cleaned value (or a boolean when $isImage is true)
     */
    public function clean($str, bool $isImage = false)
    {
        if (is_array($str)) {
            // Iterate by value: the element is re-assigned through its key,
            // so no reference (that would be left dangling) is needed.
            foreach ($str as $key => $value) {
                $str[$key] = $this->clean($value);
            }

            return $str;
        }

        // Nothing to clean for '', null, booleans, other falsy values and numbers
        if (is_bool($str) || ! $str || is_numeric($str)) {
            return $str;
        }

        // Remove Invisible Characters
        $str = $this->removeInvisibleCharacters($str);

        // URL Decode
        $str = $this->decodeUrlEncoding($str);

        // Convert character entities to ASCII (only inside tags)
        $str = $this->decodeTagEntities($str);

        // Remove Invisible Characters Again!
        $str = $this->removeInvisibleCharacters($str);

        /*
         * Convert all tabs to spaces
         *
         * This prevents strings like this: ja  vascript
         * NOTE: we deal with spaces between characters later.
         * NOTE: preg_replace was found to be amazingly slow here on
         * large blocks of data, so we use str_replace.
         */
        $str = str_replace("\t", ' ', $str);

        // Capture converted string for later comparison
        $convertedString = $str;

        // Remove Strings that are never allowed
        $str = $this->removeForbiddenStrings($str);

        // Makes PHP tags safe
        $str = $this->escapePhpTags($str, $isImage);

        // Compact any exploded words (like "j a v a s c r i p t")
        $str = $this->compactSplitWords($str);

        // Remove disallowed Javascript in links or img tags
        $str = $this->stripJsFromTags($str);

        // Sanitize naughty HTML elements
        $str = $this->escapeNaughtyElements($str);

        // Sanitize naughty scripting elements
        $str = $this->neutralizeFunctionCalls($str);

        // Final clean up
        // This adds a bit of extra precaution in case
        // something got through the above filters
        $str = $this->removeForbiddenStrings($str);

        /*
         * Images are handled in a special way: we only want to know
         * whether any unwanted, likely XSS, code was found. If the string
         * after character conversion equals the string after XSS removal,
         * nothing was removed or changed and the image is clean.
         */
        if ($isImage) {
            return ($str === $convertedString);
        }

        return $str;
    }

    /**
     * Repeatedly URL-decode the string until it is stable.
     *
     * Just in case stuff like this is submitted:
     * <a href="http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D">Google</a>
     * Note: rawurldecode() is used so plus signs are not removed.
     *
     * @param string $str
     * @return string
     */
    private function decodeUrlEncoding(string $str): string
    {
        if (stripos($str, '%') === false) {
            return $str;
        }

        do {
            $previous = $str;
            $str = (string) preg_replace_callback(
                '#%(?:\s*[0-9a-f]){2,}#i',
                [$this, 'urlDecodeSpaces'],
                rawurldecode($str)
            );
        } while ($previous !== $str);

        return $str;
    }

    /**
     * Convert character entities to ASCII so that the later checks work
     * reliably. Only entities that are within tags are converted since
     * these are the ones that will pose security problems.
     *
     * @param string $str
     * @return string
     */
    private function decodeTagEntities(string $str): string
    {
        $str = (string) preg_replace_callback(
            "/[^a-z0-9>]+[a-z0-9]+=([\'\"]).*?\\1/si",
            [$this, 'convertAttribute'],
            $str
        );

        return (string) preg_replace_callback(
            '/<\w+.*/si',
            [$this, 'decodeEntity'],
            $str
        );
    }

    /**
     * Make PHP tags safe.
     *
     * Note: XML tags are inadvertently replaced too (<?xml),
     * but it doesn't seem to pose a problem.
     *
     * @param string $str
     * @param bool $isImage images have a tendency to contain the PHP short
     * opening and closing tags every so often, so for them only the long
     * opening tag is escaped
     * @return string
     */
    private function escapePhpTags(string $str, bool $isImage): string
    {
        if ($isImage) {
            return (string) preg_replace(
                '/<\?(php)/i',
                '&lt;?\\1',
                $str
            );
        }

        return str_replace(
            ['<?', '?' . '>'],
            ['&lt;?', '?&gt;'],
            $str
        );
    }

    /**
     * Compact any exploded words.
     *
     * This corrects words like:  j a v a s c r i p t
     * These words are compacted back to their correct state.
     *
     * @param string $str
     * @return string
     */
    private function compactSplitWords(string $str): string
    {
        $words = [
            'javascript', 'expression', 'vbscript', 'jscript', 'wscript',
            'vbs', 'script', 'base64', 'applet', 'alert', 'document',
            'write', 'cookie', 'window', 'confirm', 'prompt', 'eval'
        ];

        foreach ($words as $word) {
            // "abc" becomes the regular expression "a\s*b\s*c"
            $spaced = implode('\s*', str_split($word));

            // We only want to do this when it is followed by a non-word character
            // That way valid stuff like "dealer to" does not become "dealerto"
            $str = (string) preg_replace_callback(
                '#(' . $spaced . ')(\W)#is',
                [$this, 'compactExplodedWords'],
                $str
            );
        }

        return $str;
    }

    /**
     * Remove disallowed Javascript in links or img tags, and remove
     * "script" and "xss" tags, until the string is stable.
     *
     * Note: It was reported that not only space characters, but all in
     * the following pattern can be parsed as separators between a tag name
     * and its attributes: [\d\s"\'`;,\/\=\(\x00\x0B\x09\x0C]
     * ... however, the invisible characters were already removed earlier
     * so they are skipped here.
     *
     * @param string $str
     * @return string
     */
    private function stripJsFromTags(string $str): string
    {
        do {
            $original = $str;

            if (preg_match('/<a/i', $str)) {
                $str = (string) preg_replace_callback(
                    '#<a(?:rea)?[^a-z0-9>]+([^>]*?)(?:>|$)#si',
                    [$this, 'removeJsLink'],
                    $str
                );
            }

            if (preg_match('/<img/i', $str)) {
                $str = (string) preg_replace_callback(
                    '#<img[^a-z0-9]+([^>]*?)(?:\s?/?>|$)#si',
                    [$this, 'removeJsImage'],
                    $str
                );
            }

            if (preg_match('/script|xss/i', $str)) {
                $str = (string) preg_replace(
                    '#</*(?:script|xss).*?>#si',
                    '[removed]',
                    $str
                );
            }
        } while ($original !== $str);

        return $str;
    }

    /**
     * Sanitize naughty HTML elements.
     *
     * Every tag found is handed to sanitizeNaughtyHtml() which converts
     * the naughty ones to entities. So this: <blink>
     * Becomes: &lt;blink&gt;
     *
     * @param string $str
     * @return string
     */
    private function escapeNaughtyElements(string $str): string
    {
        $pattern = '#'
            . '<((?<slash>/*\s*)((?<tagName>[a-z0-9]+)(?=[^a-z0-9]|$)|.+)' // tag
            // start and name, followed by a non-tag character
            . '[^\s\042\047a-z0-9>/=]*' // a valid attribute character
            // immediately after the tag would count as a separator
            // optional attributes
            . '(?<attributes>(?:[\s\042\047/=]*' // non-attribute characters,
            // excluding > (tag close) for obvious reasons
            . '[^\s\042\047>/=]+' // attribute characters
            // optional attribute-value
                . '(?:\s*=' // attribute-value separator
                    . '(?:[^\s\042\047=><`]+|\s*\042[^\042]*\042|\s*\047[^\047]'
                . '*\047|\s*(?U:[^\s\042\047=><`]*))' // single, double or non-quoted value
                . ')?' // end optional attribute-value group
            . ')*)' // end optional attributes group
            . '[^>]*)(?<closeTag>\>)?#isS';

        // Note: It would be nice to optimize this for speed, BUT
        // only matching the naughty elements here results in
        // false positives and in turn - vulnerabilities!
        do {
            $previous = $str;
            $str = (string) preg_replace_callback(
                $pattern,
                [$this, 'sanitizeNaughtyHtml'],
                $str
            );
        } while ($previous !== $str);

        return $str;
    }

    /**
     * Sanitize naughty scripting elements.
     *
     * Looks for PHP and JavaScript commands that are disallowed.
     * Rather than removing the code, it simply converts the
     * parenthesis to entities rendering the code un-executable.
     *
     * For example: eval('some code')
     * Becomes: eval&#40;'some code'&#41;
     *
     * @param string $str
     * @return string
     */
    private function neutralizeFunctionCalls(string $str): string
    {
        return (string) preg_replace(
            '#(alert|prompt|confirm|cmd|passthru|eval|exec|expression|system|'
                . 'fopen|fsockopen|file|file_get_contents|readfile|unlink)(\s*)\((.*?)\)#si',
            '\\1\\2&#40;\\3&#41;',
            $str
        );
    }
380
381
    /**
     * Return the XSS hash, generating it
     * (16 random characters) on first use
     * @return string
     */
    public function getXssHash(): string
    {
        if ($this->xssHash) {
            return $this->xssHash;
        }

        return $this->xssHash = Str::random(16);
    }
394
395
396
    /**
     * Get the character set currently in use
     * @return string the character set name
     */
    public function getCharset(): string
    {
        return $this->charset;
    }
404
405
    /**
     * Change the character set in use
     * @param string $charset the new character set name
     * @return $this the current instance (fluent interface)
     */
    public function setCharset(string $charset): self
    {
        $this->charset = $charset;

        return $this;
    }
415
416
    /**
     * Sanitize the filename.
     *
     * Invisible characters and every sequence listed in
     * $invalidFilenameChars are removed, then the result is un-escaped
     * with stripslashes(). When the path must not be preserved, "./" and
     * "/" are removed too.
     *
     * @param string $str the file name (or relative path) to sanitize
     * @param bool $isRelativePath whether to preserve path
     * @return string the sanitized file name
     */
    public function sanitizeFilename(string $str, bool $isRelativePath = false): string
    {
        $invalids = $this->invalidFilenameChars;
        if ($isRelativePath === false) {
            $invalids[] = './';
            $invalids[] = '/';
        }

        // Remove the invalid sequences until nothing changes, so that
        // nested input such as "....//" can not rebuild a forbidden sequence
        $stripInvalids = static function (string $value) use ($invalids): string {
            do {
                $old = $value;
                $value = str_replace($invalids, '', $value);
            } while ($old !== $value);

            return $value;
        };

        $cleanStr = stripslashes(
            $stripInvalids($this->removeInvisibleCharacters($str, false))
        );

        // stripslashes() may itself rebuild what was just filtered out:
        // ".\./" becomes "../" and an escaped "\0" becomes a NUL byte.
        // So filter once more after un-escaping. For input without such
        // constructs this second pass changes nothing.
        return $stripInvalids($this->removeInvisibleCharacters($cleanStr, false));
    }
438
439
    /**
     * Replace each "img" tag by the value of its "src" attribute
     * @param string $str the string that may contain image tags
     * @return string the string without the image tags
     */
    public function stripImageTags(string $str): string
    {
        // "src" value enclosed in single or double quotes
        $quotedSource = '#<img[\s/]+.*?src\s*=\s*(["\'])([^\\1]+?)\\1.*?\>#i';
        // "src" value without quotes
        $bareSource = '#<img[\s/]+.*?src\s*=\s*?(([^\s"\'=<>`]+)).*?\>#i';

        $stripped = preg_replace([$quotedSource, $bareSource], '\\2', $str);

        return (string) $stripped;
    }
455
456
    /**
     * HTML Entities Decode
     * A replacement for html_entity_decode()
     *
     * The reason we are not using html_entity_decode() by itself is because
     * while it is not technically correct to leave out the semicolon
     * at the end of an entity most browsers will still interpret the entity
     * correctly. html_entity_decode() does not convert entities without
     * semicolons, so we are left with our own little solution here.
     *
     * The decoding is repeated until the string no longer changes, so
     * multiple levels of encoding (like "&amp;lt;") are fully decoded.
     *
     * @param string $str the string to decode
     * @param string|null $charset the custom character set if not will use the current one
     * @return string the decoded string
     */
    protected function htmlEntityDecode(string $str, ?string $charset = null): string
    {
        if (strpos($str, '&') === false) {
            return $str;
        }

        // The translation table depends on the character set, so it is
        // cached per character set (the static is shared by all instances)
        static $entityTables = [];

        $charset = $charset ?? $this->charset;
        $flag = ENT_COMPAT | ENT_HTML5;

        if (! isset($entityTables[$charset])) {
            $entityTables[$charset] = array_map(
                'strtolower',
                get_html_translation_table(HTML_ENTITIES, $flag, $charset)
            );
        }
        $entities = $entityTables[$charset];

        do {
            $strCompare = $str;

            // Decode standard entities written without the final
            // semicolon, avoiding false positives
            $matches = [];
            if (preg_match_all('/&[a-z]{2,}(?![a-z;])/i', $str, $matches) > 0) {
                $replace = [];
                $names = array_unique(array_map('strtolower', $matches[0]));
                foreach ($names as $name) {
                    $char = array_search($name . ';', $entities, true);
                    if ($char !== false) {
                        $replace[$name] = (string) $char;
                    }
                }

                $str = str_replace(array_keys($replace), array_values($replace), $str);
            }

            // Decode the regular (semicolon terminated) entities as well as
            // numeric & UTF16 two byte entities, after adding their missing
            // semicolon. This must run on every pass, whether or not a
            // semicolon-less named entity was found above.
            $str = html_entity_decode(
                (string) preg_replace(
                    '/(&#(?:x0*[0-9a-f]{2,5}(?![0-9a-f;])|(?:0*\d{2,4}(?![0-9;]))))/iS',
                    '$1;',
                    $str
                ),
                $flag,
                $charset
            );
        } while ($strCompare !== $str);

        return $str;
    }
522
523
524
525
    /**
     * URL-decode a matched sequence once the white space
     * scattered inside it has been removed.
     * A sequence that contains no white space is returned as is.
     * @param array<int, string> $matches the regular expression match
     * @return string
     */
    protected function urlDecodeSpaces(array $matches): string
    {
        $sequence = $matches[0];
        $compacted = (string) preg_replace('#\s+#', '', $sequence);

        if ($compacted !== $sequence) {
            return rawurldecode($compacted);
        }

        return $sequence;
    }
539
540
    /**
     * Rebuild an exploded word: all the white space is removed from the
     * word itself (like 'j a v a s c r i p t') and the character that
     * follows it is kept untouched
     * @param array<int, string> $matches the word is at index 1 and
     * the following character at index 2
     * @return string
     */
    protected function compactExplodedWords(array $matches): string
    {
        $word = (string) preg_replace('/\s+/s', '', $matches[1]);
        $follower = $matches[2];

        return $word . $follower;
    }
549
550
    /**
     * Sanitize the string to remove naughty HTML elements.
     *
     * Callback used for each tag found in the input:
     *  - an unclosed tag gets its opening "<" escaped,
     *  - a tag whose name is black listed is fully escaped,
     *  - any other tag is rebuilt, each "evil" attribute (event handlers,
     *    style, ...) or attribute with an empty value being replaced
     *    by "xss=removed".
     *
     * @param array<int|string, string> $matches the match with the named
     * groups "slash", "tagName", "attributes" and "closeTag"
     * @return string
     */
    protected function sanitizeNaughtyHtml(array $matches): string
    {
        static $naughtyTags = [
            'alert', 'area', 'prompt', 'confirm', 'applet', 'audio', 'basefont',
            'base', 'behavior', 'bgsound', 'blink', 'body', 'embed', 'expression',
            'form', 'frameset', 'frame', 'head', 'html', 'ilayer','iframe', 'input',
            'button', 'select', 'isindex', 'layer', 'link', 'meta', 'keygen', 'object',
            'plaintext', 'style', 'script', 'textarea', 'title', 'math', 'video', 'svg',
            'xml', 'xss'
        ];

        static $evilAttributes = [
            'on\w+', 'style', 'xmlns', 'formaction',
            'form', 'xlink:href', 'FSCommand', 'seekSegmentTime'
        ];

        // First, escape unclosed tags
        if (empty($matches['closeTag'])) {
            return '&lt;' . $matches[1];
        } elseif (in_array(strtolower($matches['tagName']), $naughtyTags, true)) {
            // Is the element that we caught naughty? If so, escape it
            return '&lt;' . $matches[1] . '&gt;';
        } elseif (isset($matches['attributes'])) {
            // For other tags, see if their attributes are "evil" and strip those
            // We'll store the already filtered attributes here
            $attributes = [];

            $attributesPattern = '#'
                    . '(?<name>[^\s\042\047>/=]+)' // attribute characters
                    // optional attribute-value
                    . '(?:\s*=(?<value>[^\s\042\047=><`]+|\s*\042[^\042]*\042|'
                    . '\s*\047[^\047]*\047|\s*(?U:[^\s\042\047=><`]*)))'
                    // attribute-value separator;
                    . '#i';

            // Blacklist pattern for evil attribute names
            $isEvilPattern = '#^(' . implode('|', $evilAttributes) . ')$#i';
            // Each iteration filters a single attribute
            do {
                // Strip any non-alpha characters that may precede an attribute.
                // Browsers often parse these incorrectly and that has been a
                // source of numerous XSS issues.
                $matches['attributes'] = (string) preg_replace(
                    '#^[^a-z]+#i',
                    '',
                    $matches['attributes']
                );
                $attribute = [];
                if (
                    ! preg_match(
                        $attributesPattern,
                        $matches['attributes'],
                        $attribute,
                        PREG_OFFSET_CAPTURE
                    )
                ) {
                    // No (valid) attribute found? Discard everything else inside the tag
                    break;
                }

                if (
                    // Is it indeed an "evil" attribute?
                    preg_match($isEvilPattern, $attribute['name'][0]) ||
                    // Or does it have an equals sign, but no value and not quoted? Strip that too!
                    trim($attribute['value'][0]) === ''
                ) {
                    $attributes[] = 'xss=removed';
                } else {
                    $attributes[] = $attribute[0][0];
                }

                // Continue right after the attribute that was just handled
                $matches['attributes']  = (string) substr(
                    $matches['attributes'],
                    $attribute[0][1] + strlen($attribute[0][0])
                );
            } while ($matches['attributes'] !== '');

            // Put the filtered attributes back into the tag; only a tag
            // without any attribute gets an empty attribute string
            $result = count($attributes) > 0
                    ? ' ' . implode(' ', $attributes)
                    : '';

            return '<' . $matches['slash'] . $matches['tagName'] . $result . '>';
        }

        return $matches[0];
    }
641
642
    /**
     * Remove the Javascript found in the "href" of a link tag.
     * The attributes of the tag are first normalized using
     * filterAttributes(), then each dangerous "href" start is dropped.
     * @param array<int, string> $matches the whole tag is at index 0
     * and its attributes at index 1
     * @return string the cleaned tag
     */
    protected function removeJsLink(array $matches): string
    {
        $tag = $matches[0];
        $rawAttributes = $matches[1];

        $dangerousHref = '#href=.*?(?:(?:alert|prompt|confirm)(?:\(|&\#40;)|javascript:'
            . '|livescript:|mocha:|charset=|window\.|document\.|\.cookie'
            . '|<script|<xss|d\s*a\s*t\s*a\s*:)#si';

        $safeAttributes = (string) preg_replace(
            $dangerousHref,
            '',
            $this->filterAttributes($rawAttributes)
        );

        return str_replace($rawAttributes, $safeAttributes, $tag);
    }
661
662
    /**
     * Remove the Javascript found in the "src" of an image tag.
     * The attributes of the tag are first normalized using
     * filterAttributes(), then each dangerous "src" start is dropped.
     * @param array<int, string> $matches the whole tag is at index 0
     * and its attributes at index 1
     * @return string the cleaned tag
     */
    protected function removeJsImage(array $matches): string
    {
        $tag = $matches[0];
        $rawAttributes = $matches[1];

        $dangerousSource = '#src=.*?(?:(?:alert|prompt|confirm|eval)(?:\(|&\#40;)|javascript:'
            . '|livescript:|mocha:|charset=|window\.|document\.|\.cookie'
            . '|<script|<xss|base64\s*,)#si';

        $safeAttributes = (string) preg_replace(
            $dangerousSource,
            '',
            $this->filterAttributes($rawAttributes)
        );

        return str_replace($rawAttributes, $safeAttributes, $tag);
    }
681
682
    /**
     * Callback decoding the HTML entities of a matched tag.
     * The "&" separating the GET variables of an URL is swapped with
     * the XSS hash during the decoding, so that "&name=value" pairs
     * are not mistaken for entities.
     * @param array<int, string> $matches the text to decode is at index 0
     * @return string
     */
    protected function decodeEntity(array $matches): string
    {
        $hash = $this->getXssHash();

        // Protect GET variables in URLs like 901119URL5918AMP18930PROTECT8198
        $protected = (string) preg_replace(
            '|\&([a-z\_0-9\-]+)\=([a-z\_0-9\-/]+)|i',
            $hash . '\\1=\\2',
            $matches[0]
        );

        $decoded = $this->htmlEntityDecode($protected, $this->charset);

        // Un-protect the URL GET variables
        return str_replace($hash, '&', $decoded);
    }
702
703
    /**
     * Callback escaping the characters "<", ">" and "\" found
     * in a matched attribute
     * @param array<int, mixed> $matches the attribute is at index 0
     * @return string
     */
    protected function convertAttribute(array $matches): string
    {
        // None of the replacements contains a searched character, so
        // translating everything in one pass gives the same result as
        // replacing the characters one after the other
        return strtr($matches[0], [
            '>' => '&gt;',
            '<' => '&lt;',
            '\\' => '\\\\',
        ]);
    }
716
717
    /**
     * Filter tag attributes for consistency and safety.
     * Only the attributes having a quoted value are kept and the
     * comments (slash-star ... star-slash) they contain are removed.
     * @param string $str the attributes of a tag
     * @return string the kept attributes, concatenated
     */
    protected function filterAttributes(string $str): string
    {
        $found = [];
        $total = preg_match_all(
            '#\s*[a-z\-]+\s*=\s*(\042|\047)([^\\1]*?)\\1#is',
            $str,
            $found
        );

        if (! ($total > 0)) {
            return '';
        }

        $kept = '';
        foreach ($found[0] as $attribute) {
            $kept .= (string) preg_replace('#/\*.*?\*/#s', '', $attribute);
        }

        return $kept;
    }
740
741
    /**
     * Remove what is never allowed: the literal forbidden strings are
     * replaced by their configured replacement, then every match of a
     * forbidden pattern is replaced by "[removed]"
     * @param string $str the string to clean
     * @return string
     */
    protected function removeForbiddenStrings(string $str): string
    {
        $withoutLiterals = str_replace(
            array_keys($this->forbiddenStrings),
            array_values($this->forbiddenStrings),
            $str
        );

        // Add the delimiters and modifiers (case insensitive, dot matches all)
        $patterns = array_map(
            static fn (string $regex): string => '#' . $regex . '#is',
            $this->forbiddenStringPatterns
        );

        // The patterns are applied one after the other, in order
        return (string) preg_replace($patterns, '[removed]', $withoutLiterals);
    }
758
759
    /**
     * Remove invisible characters
     * This prevents sandwiching null characters
     * between ASCII characters, like Java\0script.
     *
     * Every control character is removed except newline (dec 10),
     * carriage return (dec 13) and horizontal tab (dec 09). The removal
     * is repeated until nothing is found anymore.
     *
     * @param string $str the string to clean
     * @param bool $urlEncode whether to also remove the URL encoded
     * form of these characters
     * @return string
     */
    protected function removeInvisibleCharacters(string $str, bool $urlEncode = true): string
    {
        $patterns = $urlEncode
            ? [
                '/%0[0-8bcef]/i',  // URL encoded 00-08, 11, 12, 14, 15
                '/%1[0-9a-f]/i',   // URL encoded 16-31
                '/%7f/i',          // URL encoded 127
            ]
            : [];

        // 00-08, 11, 12, 14-31, 127
        $patterns[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S';

        do {
            $replacements = 0;
            $str = (string) preg_replace($patterns, '', $str, -1, $replacements);
        } while ($replacements > 0);

        return $str;
    }
790
}
791