InputClean::clean()   F
last analyzed

Complexity

Conditions 18
Paths 131

Size

Total Lines 239
Code Lines 96

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 18
eloc 96
c 1
b 0
f 0
nc 131
nop 2
dl 0
loc 239
rs 3.7739

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
/**
4
 * Platine Framework
5
 *
6
 * Platine Framework is a lightweight, high-performance, simple and elegant
7
 * PHP Web framework
8
 *
9
 * This content is released under the MIT License (MIT)
10
 *
11
 * Copyright (c) 2020 Platine Framework
12
 *
13
 * Permission is hereby granted, free of charge, to any person obtaining a copy
14
 * of this software and associated documentation files (the "Software"), to deal
15
 * in the Software without restriction, including without limitation the rights
16
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
 * copies of the Software, and to permit persons to whom the Software is
18
 * furnished to do so, subject to the following conditions:
19
 *
20
 * The above copyright notice and this permission notice shall be included in all
21
 * copies or substantial portions of the Software.
22
 *
23
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
 * SOFTWARE.
30
 */
31
32
/**
33
 *  @file InputClean.php
34
 *
35
 *  This class applies cleaning (XSS filtering, sanitization, ...) to the request data
36
 *
37
 *  @package    Platine\Framework\Http
38
 *  @author Platine Developers team
39
 *  @copyright  Copyright (c) 2020
40
 *  @license    http://opensource.org/licenses/MIT  MIT License
41
 *  @link   https://www.platine-php.com
42
 *  @version 1.0.0
43
 *  @filesource
44
 */
45
46
declare(strict_types=1);
47
48
namespace Platine\Framework\Http;
49
50
use Platine\Stdlib\Helper\Str;
51
52
/**
 * @class InputClean
 * @package Platine\Framework\Http
 *
 * Cleans request data: XSS filtering of arbitrary input, filename
 * sanitization and removal of "img" tags.
 */
class InputClean
{
    /**
     * The list of invalid filename chars
     * @var array<string>
     */
    protected array $invalidFilenameChars = [
        '../', '<!--', '-->', '<', '>',
        '\'', '"', '&', '$', '#',
        '{', '}', '[', ']', '=',
        ';', '?', '%20', '%22',
        '%3c',      // <
        '%253c',    // <
        '%3e',      // >
        '%0e',      // >
        '%28',      // (
        '%29',      // )
        '%2528',    // (
        '%26',      // &
        '%24',      // $
        '%3f',      // ?
        '%3b',      // ;
        '%3d'       // =
    ];

    /**
     * The character set to use
     * @var string
     */
    protected string $charset = 'UTF-8';

    /**
     * The random generated XSS hash to protect URL
     * @var string
     */
    protected string $xssHash = '';

    /**
     * The list of forbidden strings (literal search => replacement)
     * @var array<string, string>
     */
    protected array $forbiddenStrings = [
        'document.cookie' => '[removed]',
        'document.write'  => '[removed]',
        '.parentNode'     => '[removed]',
        '.innerHTML'      => '[removed]',
        '-moz-binding'    => '[removed]',
        '<!--'            => '&lt;!--',
        '-->'             => '--&gt;',
        '<![CDATA['       => '&lt;![CDATA[',
        '<comment>'   => '&lt;comment&gt;',
        '<%'              => '&lt;&#37;'
    ];

    /**
     * The list of forbidden strings patterns (regex bodies, without
     * delimiters; each match is replaced by "[removed]")
     * @var array<string>
     */
    protected array $forbiddenStringPatterns = [
        'javascript\s*:',
        '(document|(document\.)?window)\.(location|on\w*)',
        'expression\s*(\(|&\#40;)', // CSS and IE
        'vbscript\s*:', // IE, surprise!
        'wscript\s*:', // IE
        'jscript\s*:', // IE
        'vbs\s*:', // IE
        'Redirect\s+30\d',
        "([\"'])?data\s*:[^\\1]*?base64[^\\1]*?,[^\\1]*?\\1?"
    ];

    /**
     * Create new instance
     * @param string $charset
     */
    public function __construct(string $charset = 'UTF-8')
    {
        $this->charset = $charset;
    }

    /**
     * The main function to clean input
     *
     * Arrays are cleaned recursively (each element is cleaned as regular,
     * non-image input). Values that are not strings, empty strings and
     * numeric strings are returned unchanged.
     *
     * @param mixed $str the value to clean
     * @param bool $isImage whether the string is image content; in that
     * case the return value is a boolean telling whether the content was
     * left untouched by the XSS filters (true means "clean")
     * @return mixed
     */
    public function clean(mixed $str, bool $isImage = false): mixed
    {
        if (is_array($str)) {
            foreach ($str as $key => $value) {
                $str[$key] = $this->clean($value);
            }

            return $str;
        }

        // Nothing to filter for null, booleans, numbers, empty strings and
        // numeric strings. Other non-string values (objects, resources)
        // cannot be filtered either, so they are returned as is instead of
        // triggering a TypeError in the string helpers below.
        if (!is_string($str) || $str === '' || is_numeric($str)) {
            return $str;
        }

        // Remove Invisible Characters
        $str = $this->removeInvisibleCharacters($str);

        // URL Decode
        $str = $this->decodeUrlEncodedInput($str);

        /*
         * Convert character entities to ASCII
         *
         * This permits our tests below to work reliably.
         * We only convert entities that are within tags since
         * these are the ones that will pose security problems.
         */
        $str = (string) preg_replace_callback(
            "/[^a-z0-9>]+[a-z0-9]+=([\'\"]).*?\\1/si",
            [$this, 'convertAttribute'],
            $str
        );

        $str = (string) preg_replace_callback(
            '/<\w+.*/si',
            [$this, 'decodeEntity'],
            $str
        );

        // Remove Invisible Characters Again!
        $str = $this->removeInvisibleCharacters($str);

        /*
         * Convert all tabs to spaces
         *
         * This prevents strings like this: ja  vascript
         * NOTE: we deal with spaces between characters later.
         * NOTE: preg_replace was found to be amazingly slow here on
         * large blocks of data, so we use str_replace.
         */
        $str = str_replace("\t", ' ', $str);

        // Capture converted string for later comparison
        $convertedString = $str;

        // Remove Strings that are never allowed
        $str = $this->removeForbiddenStrings($str);

        $str = $this->escapePhpTags($str, $isImage);
        $str = $this->compactKnownWords($str);
        $str = $this->removeJsFromTags($str);
        $str = $this->escapeNaughtyTags($str);
        $str = $this->neutralizeScriptingCalls($str);

        // Final clean up
        // This adds a bit of extra precaution in case
        // something got through the above filters
        $str = $this->removeForbiddenStrings($str);

        /*
         * Images are Handled in a Special Way
         * - Essentially, we want to know that after all of the character
         * conversion is done whether any unwanted, likely XSS, code was found.
         * If not, we return TRUE, as the image is clean.
         * However, if the string post-conversion does not match the
         * string post-removal of XSS, then it fails, as there was unwanted XSS
         * code found and removed/changed during processing.
         */
        if ($isImage) {
            return ($str === $convertedString);
        }

        return $str;
    }

    /**
     * Generate the XSS hash if not yet generated
     * and return it
     * @return string
     */
    public function getXssHash(): string
    {
        if ($this->xssHash === '') {
            $this->xssHash = Str::random(16);
        }

        return $this->xssHash;
    }

    /**
     * Return the character set
     * @return string
     */
    public function getCharset(): string
    {
        return $this->charset;
    }

    /**
     * Set the character set
     * @param string $charset
     * @return $this
     */
    public function setCharset(string $charset): self
    {
        $this->charset = $charset;
        return $this;
    }

    /**
     * Sanitize the filename
     *
     * Every entry of the invalid filename chars list is removed, repeatedly,
     * until the string no longer changes (so that removing one sequence
     * cannot assemble another one).
     *
     * @param string $str
     * @param bool $isRelativePath whether to preserve path; when false
     * the "./" and "/" sequences are removed too
     * @return string
     */
    public function sanitizeFilename(string $str, bool $isRelativePath = false): string
    {
        $invalids = $this->invalidFilenameChars;
        if ($isRelativePath === false) {
            $invalids[] = './';
            $invalids[] = '/';
        }

        $cleanStr = $this->removeInvisibleCharacters($str, false);
        do {
            $old = $cleanStr;
            $cleanStr = str_replace($invalids, '', $cleanStr);
        } while ($old !== $cleanStr);

        return stripslashes($cleanStr);
    }

    /**
     * Remove the "img" tags
     *
     * Each "img" tag is replaced by the value of its "src" attribute.
     *
     * @param string $str
     * @return string
     */
    public function stripImageTags(string $str): string
    {
        return (string) preg_replace(
            [
                '#<img[\s/]+.*?src\s*=\s*(["\'])([^\\1]+?)\\1.*?\>#i',
                '#<img[\s/]+.*?src\s*=\s*?(([^\s"\'=<>`]+)).*?\>#i',
            ],
            '\\2',
            $str
        );
    }

    /**
     * HTML Entities Decode
     * A replacement for html_entity_decode()
     *
     * The reason we are not using html_entity_decode() by itself is because
     * while it is not technically correct to leave out the semicolon
     * at the end of an entity most browsers will still interpret the entity
     * correctly. html_entity_decode() does not convert entities without
     * semicolons, so we are left with our own little solution here. Bummer.
     *
     * @param string $str
     * @param string|null $charset the custom character set if not will use the current one
     * @return string
     */
    protected function htmlEntityDecode(string $str, ?string $charset = null): string
    {
        if (!str_contains($str, '&')) {
            return $str;
        }

        // Lower-cased translation tables (character => entity), built once
        // per character set
        static $tables = [];

        $charset ??= $this->charset;
        $flag = ENT_COMPAT | ENT_HTML5;

        $tables[$charset] ??= array_map(
            'strtolower',
            get_html_translation_table(HTML_ENTITIES, $flag, $charset)
        );
        $entities = $tables[$charset];

        do {
            $strCompare = $str;

            // Decode standard entities written without the trailing
            // semicolon, avoiding false positives
            $matches = [];
            if (preg_match_all('/&[a-z]{2,}(?![a-z;])/i', $str, $matches) > 0) {
                $replace = [];
                foreach (array_unique(array_map('strtolower', $matches[0])) as $match) {
                    $char = array_search($match . ';', $entities, true);
                    if ($char !== false) {
                        $replace[$match] = $char;
                    }
                }

                $str = str_replace(array_keys($replace), array_values($replace), $str);
            }

            // Decode numeric & UTF16 two byte entities (a missing semicolon
            // is appended first) as well as every well-formed entity.
            // This must run on every pass, not only when an unterminated
            // named entity was found above, otherwise input such as
            // "&#106;avascript" is never decoded.
            $str = html_entity_decode(
                (string) preg_replace(
                    '/(&#(?:x0*[0-9a-f]{2,5}(?![0-9a-f;])|(?:0*\d{2,4}(?![0-9;]))))/iS',
                    '$1;',
                    $str
                ),
                $flag,
                $charset
            );
        } while ($strCompare !== $str);

        return $str;
    }

    /**
     * The URL decode taking space into account
     * @param array<int, string> $matches
     * @return string
     */
    protected function urlDecodeSpaces(array $matches): string
    {
        $input = $matches[0];
        $noSpace = (string) preg_replace('#\s+#', '', $input);

        return $noSpace === $input
                ? $input
                : rawurldecode($noSpace);
    }

    /**
     * Compact exploded words (remove white space from string like 'j a v a s c r i p t')
     * @param array<int, string> $matches
     * @return string
     */
    protected function compactExplodedWords(array $matches): string
    {
        return (string) preg_replace('/\s+/s', '', $matches[1]) . $matches[2];
    }

    /**
     * Sanitize the string to remove naughty HTML elements
     *
     * Unclosed tags and tags whose name is in the naughty list are escaped.
     * For any other tag, each "evil" attribute (and each attribute with an
     * equals sign but an empty value) is replaced by "xss=removed"; the
     * remaining attributes are kept.
     *
     * @param array<int|string, string> $matches
     * @return string
     */
    protected function sanitizeNaughtyHtml(array $matches): string
    {
        static $naughtyTags = [
            'alert', 'area', 'prompt', 'confirm', 'applet', 'audio', 'basefont',
            'base', 'behavior', 'bgsound', 'blink', 'body', 'embed', 'expression',
            'form', 'frameset', 'frame', 'head', 'html', 'ilayer','iframe', 'input',
            'button', 'select', 'isindex', 'layer', 'link', 'meta', 'keygen', 'object',
            'plaintext', 'style', 'script', 'textarea', 'title', 'math', 'video', 'svg',
            'xml', 'xss'
        ];

        static $evilAttributes = [
            'on\w+', 'style', 'xmlns', 'formaction',
            'form', 'xlink:href', 'FSCommand', 'seekSegmentTime'
        ];

        // First, escape unclosed tags
        if (empty($matches['closeTag'])) {
            return '&lt;' . $matches[1];
        }

        // Is the element that we caught naughty? If so, escape it
        if (in_array(strtolower($matches['tagName']), $naughtyTags, true)) {
            return '&lt;' . $matches[1] . '&gt;';
        }

        if (empty($matches['attributes'])) {
            return $matches[0];
        }

        // For other tags, see if their attributes are "evil" and strip those
        // We'll store the already filtered attributes here
        $attributes = [];

        $attributesPattern = '#'
                . '(?<name>[^\s\042\047>/=]+)' // attribute characters
                // optional attribute-value
                . '(?:\s*=(?<value>[^\s\042\047=><`]+|\s*\042[^\042]*\042|'
                . '\s*\047[^\047]*\047|\s*(?U:[^\s\042\047=><`]*)))'
                // attribute-value separator;
                . '#i';

        // Blacklist pattern for evil attribute names
        $isEvilPattern = '#^(' . implode('|', $evilAttributes) . ')$#i';

        // Each iteration filters a single attribute
        do {
            // Strip any non-alpha characters that may precede an attribute.
            // Browsers often parse these incorrectly and that has been a
            // source of numerous XSS issues.
            $matches['attributes'] = (string) preg_replace(
                '#^[^a-z]+#i',
                '',
                $matches['attributes']
            );
            $attribute = [];
            if (
                ! preg_match(
                    $attributesPattern,
                    $matches['attributes'],
                    $attribute,
                    PREG_OFFSET_CAPTURE
                )
            ) {
                // No (valid) attribute found? Discard everything else inside the tag
                break;
            }

            if (
                // Is it indeed an "evil" attribute?
                preg_match($isEvilPattern, $attribute['name'][0]) ||
                // Or does it have an equals sign, but no value and not quoted? Strip that too!
                trim($attribute['value'][0]) === ''
            ) {
                $attributes[] = 'xss=removed';
            } else {
                $attributes[] = $attribute[0][0];
            }

            // Continue right after the attribute that was just handled
            $matches['attributes']  = (string) substr(
                $matches['attributes'],
                $attribute[0][1] + strlen($attribute[0][0])
            );
        } while ($matches['attributes'] !== '');

        // Re-attach the filtered attributes (if any) to the tag
        $result = $attributes === []
                ? ''
                : ' ' . implode(' ', $attributes);

        return '<' . $matches['slash'] . $matches['tagName'] . $result . '>';
    }

    /**
     * Remove the JS link from the string
     * @param array<int, string> $matches
     * @return string
     */
    protected function removeJsLink(array $matches): string
    {
        return str_replace(
            $matches[1],
            (string) preg_replace(
                '#href=.*?(?:(?:alert|prompt|confirm)(?:\(|&\#40;)|javascript:'
                    . '|livescript:|mocha:|charset=|window\.|document\.|\.cookie'
                    . '|<script|<xss|d\s*a\s*t\s*a\s*:)#si',
                '',
                $this->filterAttributes($matches[1])
            ),
            $matches[0]
        );
    }

    /**
     * Remove the JS from image tags
     * @param array<int, string> $matches
     * @return string
     */
    protected function removeJsImage(array $matches): string
    {
        return str_replace(
            $matches[1],
            (string) preg_replace(
                '#src=.*?(?:(?:alert|prompt|confirm|eval)(?:\(|&\#40;)|javascript:'
                    . '|livescript:|mocha:|charset=|window\.|document\.|\.cookie'
                    . '|<script|<xss|base64\s*,)#si',
                '',
                $this->filterAttributes($matches[1])
            ),
            $matches[0]
        );
    }

    /**
     * The HTML entities decode callback
     * @param array<int, string> $matches
     * @return string
     */
    protected function decodeEntity(array $matches): string
    {
        // Protect GET variables in URLs like 901119URL5918AMP18930PROTECT8198
        $str = (string) preg_replace(
            '|\&([a-z\_0-9\-]+)\=([a-z\_0-9\-/]+)|i',
            $this->getXssHash() . '\\1=\\2',
            $matches[0]
        );
        // Decode, then un-protect URL GET vars
        return str_replace(
            $this->getXssHash(),
            '&',
            $this->htmlEntityDecode($str)
        );
    }

    /**
     * Convert the attribute
     *
     * Escapes the "<", ">" and backslash characters found in the match.
     *
     * @param array<int, mixed> $matches
     * @return string
     */
    protected function convertAttribute(array $matches): string
    {
        return str_replace(
            ['>', '<', '\\'],
            ['&gt;', '&lt;', '\\\\'],
            $matches[0]
        );
    }

    /**
     *  Filter tag attributes for consistency and safety.
     *
     * Only quoted attributes are kept and the comments inside
     * them are removed.
     *
     * @param string $str
     * @return string
     */
    protected function filterAttributes(string $str): string
    {
        $result = '';
        $matches = [];
        if (
            preg_match_all(
                '#\s*[a-z\-]+\s*=\s*(\042|\047)([^\\1]*?)\\1#is',
                $str,
                $matches
            ) > 0
        ) {
            foreach ($matches[0] as $match) {
                $result .= (string) preg_replace('#/\*.*?\*/#s', '', $match);
            }
        }

        return $result;
    }

    /**
     * Remove the forbidden strings
     * @param string $str
     * @return string
     */
    protected function removeForbiddenStrings(string $str): string
    {
        $keys = array_keys($this->forbiddenStrings);
        $values = array_values($this->forbiddenStrings);

        $cleanStr = str_replace($keys, $values, $str);
        foreach ($this->forbiddenStringPatterns as $regex) {
            $cleanStr = (string) preg_replace('#' . $regex . '#is', '[removed]', $cleanStr);
        }

        return $cleanStr;
    }

    /**
     * Remove invisible characters
     * This prevents sandwiching null characters
     * between ASCII characters, like Java\0script.
     *
     * @param string $str
     * @param bool $urlEncode whether to remove the URL encoded form too
     * @return string
     */
    protected function removeInvisibleCharacters(string $str, bool $urlEncode = true): string
    {
        $nonDisplayables = [];

        /* Every control character except newline (dec 10),
        *  carriage return (dec 13) and horizontal tab (dec 09)
        */

        if ($urlEncode) {
            $nonDisplayables[] = '/%0[0-8bcef]/i';  // URL encoded 00-08, 11, 12, 14, 15
            $nonDisplayables[] = '/%1[0-9a-f]/i';   // URL encoded 16-31
            $nonDisplayables[] = '/%7f/i';      // URL encoded 127
        }

        $nonDisplayables[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127
        $count = 0;
        do {
            // Repeat until nothing is removed, as a removal can assemble
            // a new sequence to remove
            $str = (string) preg_replace($nonDisplayables, '', $str, -1, $count);
        } while ($count > 0);

        return $str;
    }

    /**
     * URL decode the string until it is stable.
     *
     * Just in case stuff like this is submitted:
     * <a href="http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D">Google</a>
     * Note: rawurldecode() is used so it does not remove plus signs
     *
     * @param string $str
     * @return string
     */
    private function decodeUrlEncodedInput(string $str): string
    {
        if (!str_contains($str, '%')) {
            return $str;
        }

        do {
            $oldStr = $str;
            $str = (string) preg_replace_callback(
                '#%(?:\s*[0-9a-f]){2,}#i',
                [$this, 'urlDecodeSpaces'],
                rawurldecode($str)
            );
        } while ($oldStr !== $str);

        return $str;
    }

    /**
     * Makes PHP tags safe.
     *
     * Note: XML tags are inadvertently replaced too (<?xml),
     * but it doesn't seem to pose a problem.
     *
     * @param string $str
     * @param bool $isImage
     * @return string
     */
    private function escapePhpTags(string $str, bool $isImage): string
    {
        if ($isImage) {
            // Images have a tendency to have the PHP short opening and
            // closing tags every so often so we skip those and only
            // do the long opening tags.
            return (string) preg_replace(
                '/<\?(php)/i',
                '&lt;?\\1',
                $str
            );
        }

        return str_replace(
            ['<?', '?' . '>'],
            ['&lt;?', '?&gt;'],
            $str
        );
    }

    /**
     * Compact any exploded words.
     *
     * This corrects words like:  j a v a s c r i p t
     * These words are compacted back to their correct state.
     *
     * @param string $str
     * @return string
     */
    private function compactKnownWords(string $str): string
    {
        $words = [
            'javascript', 'expression', 'vbscript', 'jscript', 'wscript',
            'vbs', 'script', 'base64', 'applet', 'alert', 'document',
            'write', 'cookie', 'window', 'confirm', 'prompt', 'eval'
        ];

        foreach ($words as $word) {
            // Allow any amount of white space between the letters
            $spaced = implode('\s*', str_split($word));

            // We only want to do this when it is followed by a non-word character
            // That way valid stuff like "dealer to" does not become "dealerto"
            $str = (string) preg_replace_callback(
                '#(' . $spaced . ')(\W)#is',
                [$this, 'compactExplodedWords'],
                $str
            );
        }

        return $str;
    }

    /**
     * Remove disallowed Javascript in links or img tags, and remove
     * the "script" and "xss" tags, until the string is stable.
     *
     * Note: It was reported that not only space characters, but all in
     * the following pattern can be parsed as separators between a tag name
     * and its attributes: [\d\s"\'`;,\/\=\(\x00\x0B\x09\x0C]
     * ... however, remove invisible characters already strips the
     * hex-encoded ones, so they are skipped here.
     *
     * @param string $str
     * @return string
     */
    private function removeJsFromTags(string $str): string
    {
        do {
            $original = $str;

            if (preg_match('/<a/i', $str)) {
                $str = (string) preg_replace_callback(
                    '#<a(?:rea)?[^a-z0-9>]+([^>]*?)(?:>|$)#si',
                    [$this, 'removeJsLink'],
                    $str
                );
            }

            if (preg_match('/<img/i', $str)) {
                $str = (string) preg_replace_callback(
                    '#<img[^a-z0-9]+([^>]*?)(?:\s?/?>|$)#si',
                    [$this, 'removeJsImage'],
                    $str
                );
            }

            if (preg_match('/script|xss/i', $str)) {
                $str = (string) preg_replace(
                    '#</*(?:script|xss).*?>#si',
                    '[removed]',
                    $str
                );
            }
        } while ($original !== $str);

        return $str;
    }

    /**
     * Sanitize naughty HTML elements.
     *
     * If a tag containing any of the naughty words is found,
     * the tag gets converted to entities.
     *
     * So this: <blink>
     * Becomes: &lt;blink&gt;
     *
     * @param string $str
     * @return string
     */
    private function escapeNaughtyTags(string $str): string
    {
        $pattern = '#'
            . '<((?<slash>/*\s*)((?<tagName>[a-z0-9]+)(?=[^a-z0-9]|$)|.+)' // tag
            // start and name, followed by a non-tag character
            . '[^\s\042\047a-z0-9>/=]*' // a valid attribute character
            // immediately after the tag would count as a separator
            // optional attributes
            . '(?<attributes>(?:[\s\042\047/=]*' // non-attribute characters,
            // excluding > (tag close) for obvious reasons
            . '[^\s\042\047>/=]+' // attribute characters
            // optional attribute-value
                . '(?:\s*=' // attribute-value separator
                    . '(?:[^\s\042\047=><`]+|\s*\042[^\042]*\042|\s*\047[^\047]'
                . '*\047|\s*(?U:[^\s\042\047=><`]*))' // single, double or non-quoted value
                . ')?' // end optional attribute-value group
            . ')*)' // end optional attributes group
            . '[^>]*)(?<closeTag>\>)?#isS';

        // Note: It would be nice to optimize this for speed, BUT
        // only matching the naughty elements here results in
        // false positives and in turn - vulnerabilities!
        do {
            $oldStr = $str;
            $str = (string) preg_replace_callback(
                $pattern,
                [$this, 'sanitizeNaughtyHtml'],
                $str
            );
        } while ($oldStr !== $str);

        return $str;
    }

    /**
     * Sanitize naughty scripting elements.
     *
     * Looks for PHP and JavaScript commands that are disallowed.
     * Rather than removing the code, it simply converts the
     * parenthesis to entities rendering the code un-executable.
     *
     * For example: eval('some code')
     * Becomes: eval&#40;'some code'&#41;
     *
     * @param string $str
     * @return string
     */
    private function neutralizeScriptingCalls(string $str): string
    {
        return (string) preg_replace(
            '#(alert|prompt|confirm|cmd|passthru|eval|exec|expression|system|'
                . 'fopen|fsockopen|file|file_get_contents|readfile|unlink)(\s*)\((.*?)\)#si',
            '\\1\\2&#40;\\3&#41;',
            $str
        );
    }
}
792