Completed
Push — prado-3.3 ( e90646...0b76d5 )
by Fabio
23:37 queued 03:01
created

TSafeHtmlParser::_writeAttrs()   D

Complexity

Conditions 23
Paths 2

Size

Total Lines 94

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 23
nc 2
nop 1
dl 0
loc 94
rs 4.1666
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
/* vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: */
3
4
/**
5
 * SafeHTML Parser
6
 *
7
 * PHP versions 4 and 5
8
 *
9
 * @category   HTML
10
 * @package    System.Security
11
 * @author     Roman Ivanov <[email protected]>
12
 * @copyright  2004-2005 Roman Ivanov
13
 * @license    http://www.debian.org/misc/bsd.license  BSD License (3 Clause)
14
 * @version    1.3.7
15
 * @link       http://pixel-apes.com/safehtml/
16
 */
17
18
19
/**
20
 * This package requires HTMLSax3 package
21
 */
22
Prado::using('System.3rdParty.SafeHtml.HTMLSax3');
23
24
25
/**
26
 *
27
 * TSafeHtmlParser
28
 *
29
 * This parser strips down all potentially dangerous content within HTML:
30
 * <ul>
31
 * <li>opening tag without its closing tag</li>
32
 * <li>closing tag without its opening tag</li>
33
 * <li>any of these tags: "base", "basefont", "head", "html", "body", "applet",
34
 * "object", "iframe", "frame", "frameset", "script", "layer", "ilayer", "embed",
35
 * "bgsound", "link", "meta", "style", "title", "blink", "xml" etc.</li>
36
 * <li>any of these attributes: on*, data*, dynsrc</li>
37
 * <li>javascript:/vbscript:/about: etc. protocols</li>
38
 * <li>expression/behavior etc. in styles</li>
39
 * <li>any other active content</li>
40
 * </ul>
41
 * It also tries to convert the code to valid XHTML, but htmltidy is a far better
42
 * solution for this task.
43
 *
44
 * <b>Example:</b>
45
 * <pre>
46
 * $parser = Prado::createComponent('System.3rdParty.SafeHtml.TSafeHtmlParser');
47
 * $result = $parser->parse($doc);
48
 * </pre>
49
 *
50
 * @category   HTML
51
 * @package    System.Security
52
 * @author     Roman Ivanov <[email protected]>
53
 * @copyright  1997-2005 Roman Ivanov
54
 * @license    http://www.debian.org/misc/bsd.license  BSD License (3 Clause)
55
 * @version    Release: @package_version@
56
 * @link       http://pear.php.net/package/SafeHTML
57
 */
58
class TSafeHtmlParser
59
{
60
    /**
61
     * Storage for resulting HTML output
62
     *
63
     * @var string
64
     * @access private
65
     */
66
    private $_xhtml = '';
67
68
    /**
69
     * Array of counters for each tag
70
     *
71
     * @var array
72
     * @access private
73
     */
74
    private $_counter = array();
75
76
    /**
77
     * Stack of unclosed tags
78
     *
79
     * @var array
80
     * @access private
81
     */
82
    private $_stack = array();
83
84
    /**
85
     * Array of counters for tags that must be deleted with all content
86
     *
87
     * @var array
88
     * @access private
89
     */
90
    private $_dcCounter = array();
91
92
    /**
93
     * Stack of unclosed tags that must be deleted with all content
94
     *
95
     * @var array
96
     * @access private
97
     */
98
    private $_dcStack = array();
99
100
    /**
101
     * Stores level of list (ol/ul) nesting
102
     *
103
     * @var int
104
     * @access private
105
     */
106
    private $_listScope = 0;
107
108
    /**
109
     * Stack of unclosed list tags
110
     *
111
     * @var array
112
     * @access private
113
     */
114
    private $_liStack = array();
115
116
    /**
117
     * Array of prepared regular expressions for protocols (schemas) matching
118
     *
119
     * @var array
120
     * @access private
121
     */
122
    private $_protoRegexps = array();
123
124
    /**
125
     * Array of prepared regular expressions for CSS matching
126
     *
127
     * @var array
128
     * @access private
129
     */
130
    private $_cssRegexps = array();
131
132
    /**
133
     * List of single tags ("<tag />")
134
     *
135
     * @var array
136
     * @access public
137
     */
138
    public $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', );
139
140
    /**
141
     * List of dangerous tags (such tags will be deleted)
142
     *
143
     * @var array
144
     * @access public
145
     */
146
    public $deleteTags = array(
147
        'applet', 'base',   'basefont', 'bgsound', 'blink',  'body',
148
        'embed',  'frame',  'frameset', 'head',    'html',   'ilayer',
149
        'iframe', 'layer',  'link',     'meta',    'object', 'style',
150
        'title',  'script',
151
        );
152
153
    /**
154
     * List of dangerous tags (such tags will be deleted, and all content
155
     * inside these tags will also be removed)
156
     *
157
     * @var array
158
     * @access public
159
     */
160
    public $deleteTagsContent = array('script', 'style', 'title', 'xml', );
161
162
    /**
163
     * Type of protocols filtering ('white' or 'black')
164
     *
165
     * @var string
166
     * @access public
167
     */
168
    public $protocolFiltering = 'white';
169
170
    /**
171
     * List of "dangerous" protocols (used for blacklist-filtering)
172
     *
173
     * @var array
174
     * @access public
175
     */
176
    public $blackProtocols = array(
177
        'about',   'chrome',     'data',       'disk',     'hcp',
178
        'help',    'javascript', 'livescript', 'lynxcgi',  'lynxexec',
179
        'ms-help', 'ms-its',     'mhtml',      'mocha',    'opera',
180
        'res',     'resource',   'shell',      'vbscript', 'view-source',
181
        'vnd.ms.radio',          'wysiwyg',
182
        );
183
184
    /**
185
     * List of "safe" protocols (used for whitelist-filtering)
186
     *
187
     * @var array
188
     * @access public
189
     */
190
    public $whiteProtocols = array(
191
        'ed2k',   'file', 'ftp',  'gopher', 'http',  'https',
192
        'irc',    'mailto', 'news', 'nntp', 'telnet', 'webcal',
193
        'xmpp',   'callto',
194
        );
195
196
    /**
197
     * List of attributes that can contain protocols
198
     *
199
     * @var array
200
     * @access public
201
     */
202
    public $protocolAttributes = array(
203
        'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src',
204
        );
205
206
    /**
207
     * List of dangerous CSS keywords
208
     *
209
     * The whole style="" attribute will be removed if the parser finds one of
210
     * these keywords
211
     *
212
     * @var array
213
     * @access public
214
     */
215
    public $cssKeywords = array(
216
        'absolute', 'behavior',       'behaviour',   'content', 'expression',
217
        'fixed',    'include-source', 'moz-binding',
218
        );
219
220
    /**
221
     * List of tags that can have no "closing tag"
222
     *
223
     * @var array
224
     * @access public
225
     * @deprecated XHTML does not allow such tags
226
     */
227
    public $noClose = array();
228
229
    /**
230
     * List of block-level tags that terminate a paragraph
231
     *
232
     * The paragraph will be closed when one of these tags is opened
233
     *
234
     * @var array
235
     * @access public
236
     */
237
    public $closeParagraph = array(
238
        'address', 'blockquote', 'center', 'dd',      'dir',       'div',
239
        'dl',      'dt',         'h1',     'h2',      'h3',        'h4',
240
        'h5',      'h6',         'hr',     'isindex', 'listing',   'marquee',
241
        'menu',    'multicol',   'ol',     'p',       'plaintext', 'pre',
242
        'table',   'ul',         'xmp',
243
        );
244
245
    /**
246
     * List of table tags, all table tags outside a table will be removed
247
     *
248
     * @var array
249
     * @access public
250
     */
251
    public $tableTags = array(
252
        'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
253
        'thead',   'tr',
254
        );
255
256
    /**
257
     * List of list tags
258
     *
259
     * @var array
260
     * @access public
261
     */
262
    public $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', );
263
264
    /**
265
     * List of dangerous attributes
266
     *
267
     * @var array
268
     * @access public
269
     */
270
    public $attributes = array('dynsrc');
271
    //public $attributes = array('dynsrc', 'id', 'name', ); //id and name are dangerous?
272
273
    /**
274
     * List of allowed "namespaced" attributes
275
     *
276
     * @var array
277
     * @access public
278
     */
279
    public $attributesNS = array('xml:lang', );
280
281
    /**
     * Builds the protocol and CSS matching patterns from the configured lists.
     *
     * For every entry of $blackProtocols a case-insensitive pattern is appended
     * to $_protoRegexps. The pattern matches the letters of the protocol name
     * followed by a colon, and tolerates whitespace and control characters
     * (\x01-\x1F) before, between and after the individual letters.
     *
     * For every entry of $cssKeywords a case-insensitive substring pattern is
     * appended to $_cssRegexps.
     *
     * @access public
     */
    public function __construct()
    {
        // Filler allowed around every letter of a protocol name.
        $filler = "[\s\x01-\x1F]*";

        foreach ($this->blackProtocols as $proto) {
            $preg = '/' . $filler;
            $length = strlen($proto);
            for ($i = 0; $i < $length; $i++) {
                // Square-bracket offset: the curly-brace form $proto{$i} was
                // removed in PHP 8. Each character is quoted so that '.' and
                // '-' in names such as "vnd.ms.radio" are matched literally.
                $preg .= preg_quote($proto[$i], '/') . $filler;
            }
            $this->_protoRegexps[] = $preg . ':/i';
        }

        foreach ($this->cssKeywords as $css) {
            // Keywords are plain text, not patterns, so quote them as well.
            $this->_cssRegexps[] = '/' . preg_quote($css, '/') . '/i';
        }
        // No return statement: a constructor's return value is discarded.
    }
303
304
    /**
     * Handles the writing of attributes - called from $this->_openHandler()
     *
     * Every accepted attribute is appended to $_xhtml as ` name="value"`.
     * An attribute is silently dropped when
     * - its lower-cased name starts with "on" or "data", or is listed in
     *   $attributes;
     * - its name is not purely alphanumeric and is not listed in $attributesNS;
     * - it is a "style" attribute whose cleaned value matches one of the
     *   patterns in $_cssRegexps or $_protoRegexps;
     * - it is listed in $protocolAttributes, its entity-decoded value contains
     *   a colon, and the value fails the check selected by $protocolFiltering
     *   ('black': must not match $_protoRegexps; otherwise: the text before
     *   the first colon must be listed in $whiteProtocols).
     *
     * @param array $attrs array of attributes $name => $value
     * @return boolean always true
     * @access private
     */
    private function _writeAttrs ($attrs)
    {
        if (!is_array($attrs)) {
            return true;
        }

        foreach ($attrs as $name => $value) {

            $name = strtolower($name);

            // Event handlers (on*), data* attributes and blacklisted names.
            if (strpos($name, 'on') === 0 || strpos($name, 'data') === 0) {
                continue;
            }
            if (in_array($name, $this->attributes, true)) {
                continue;
            }
            // Non-alphanumeric names are only allowed when explicitly listed.
            if (!preg_match("/^[a-z0-9]+$/i", $name)
                && !in_array($name, $this->attributesNS, true)) {
                continue;
            }

            // Valueless attributes are written as name="name".
            if (($value === true) || is_null($value)) {
                $value = $name;
            }

            if ($name === 'style') {

                // removes insignificant backslashes
                $value = str_replace("\\", '', $value);

                // removes CSS comments; repeated until nothing changes, so a
                // comment that only forms after a removal is stripped too
                while (true) {
                    $stripped = preg_replace("!/\*.*?\*/!s", '', $value);
                    if ($stripped === null) {
                        // PCRE failure: drop the attribute instead of
                        // emitting CSS that could not be checked.
                        continue 2;
                    }
                    if ($stripped === $value) {
                        break;
                    }
                    $value = $stripped;
                }

                // replace all & to &amp; (existing &amp; is undone first so
                // it does not get encoded twice)
                $value = str_replace('&amp;', '&', $value);
                $value = str_replace('&', '&amp;', $value);

                foreach ($this->_cssRegexps as $css) {
                    if (preg_match($css, $value)) {
                        continue 2;
                    }
                }
                foreach ($this->_protoRegexps as $proto) {
                    if (preg_match($proto, $value)) {
                        continue 2;
                    }
                }
            }

            // Decode numeric character references into a scratch copy so that
            // an obfuscated protocol such as "javascript&#58;" is visible to
            // the protocol checks below. Capture group 1 holds the number;
            // group 0 is the whole reference including "&#" and must not be
            // passed to chr()/hexdec().
            $tempval = preg_replace_callback(
                '/&#(\d+);?/m',
                function ($matches) {
                    // Reduced to one byte explicitly (chr() wraps modulo 256).
                    return chr(((int) $matches[1]) & 0xFF);
                },
                $value
            );

            $tempval = preg_replace_callback(
                '/&#x([0-9a-f]+);?/mi',
                function ($matches) {
                    // The last two hex digits are the value modulo 256; using
                    // them avoids float results for very long digit runs.
                    return chr(hexdec(substr($matches[1], -2)));
                },
                $tempval
            );

            if (in_array($name, $this->protocolAttributes, true)
                && (strpos($tempval, ':') !== false)) {
                if ($this->protocolFiltering == 'black') {
                    foreach ($this->_protoRegexps as $proto) {
                        if (preg_match($proto, $tempval)) {
                            continue 2;
                        }
                    }
                } else {
                    $_tempval = explode(':', $tempval);
                    // URI schemes are case-insensitive, so "HTTP:" must be
                    // treated like "http:".
                    $proto = strtolower($_tempval[0]);
                    if (!in_array($proto, $this->whiteProtocols, true)) {
                        continue;
                    }
                }
            }

            // The original (not the decoded) value is written, with double
            // quotes escaped so it cannot break out of the attribute.
            $value = str_replace("\"", "&quot;", $value);
            $this->_xhtml .= ' ' . $name . '="' . $value . '"';
        }

        return true;
    }
405
406
    /**
     * Opening tag handler - called from HTMLSax
     *
     * Decides whether an opening tag is written to $_xhtml:
     * - tags listed in $deleteTagsContent are pushed onto $_dcStack; while
     *   that stack is non-empty nothing is written;
     * - tags listed in $deleteTags are skipped;
     * - tags whose name is not purely alphanumeric are skipped, except that
     *   names containing "@" or "://" are written as escaped text;
     * - tags listed in $singleTags are written self-closed;
     * - tags listed in $tableTags are skipped while no table is open;
     * - an open "p" is closed before a tag listed in $closeParagraph, and an
     *   open "li" of the same list level is closed before a new "li".
     * Everything else is written with its filtered attributes and recorded
     * in $_stack and $_counter.
     *
     * @param object $parser HTML Parser
     * @param string $name   tag name
     * @param array  $attrs  tag attributes
     * @return boolean always true
     * @access private
     */
    public function _openHandler(&$parser, $name, $attrs)
    {
        $name = strtolower($name);

        if (in_array($name, $this->deleteTagsContent)) {
            array_push($this->_dcStack, $name);
            $this->_dcCounter[$name] = isset($this->_dcCounter[$name]) ? $this->_dcCounter[$name]+1 : 1;
        }
        // Inside a tag whose content is deleted: swallow everything.
        if (count($this->_dcStack) != 0) {
            return true;
        }

        if (in_array($name, $this->deleteTags)) {
            return true;
        }

        if (!preg_match("/^[a-z0-9]+$/i", $name)) {
            // Looks like an e-mail address or URL in angle brackets:
            // keep it, but as escaped text rather than as a tag.
            if (preg_match("!(?:\@|://)!i", $name)) {
                $this->_xhtml .= '&lt;' . $name . '&gt;';
            }
            return true;
        }

        if (in_array($name, $this->singleTags)) {
            $this->_xhtml .= '<' . $name;
            $this->_writeAttrs($attrs);
            $this->_xhtml .= ' />';
            return true;
        }

        // TABLES: cannot open table elements when we are not inside table.
        // A missing counter means no table has been opened yet, which is
        // "outside a table" just like a counter that has dropped back to 0.
        $insideTable = isset($this->_counter['table']) && ($this->_counter['table'] > 0);
        if (!$insideTable && in_array($name, $this->tableTags)) {
            return true;
        }

        // PARAGRAPHS: close paragraph when closeParagraph tags opening
        if ((in_array($name, $this->closeParagraph)) && (in_array('p', $this->_stack))) {
            $this->_closeHandler($parser, 'p');
        }

        // LISTS: we should close <li> if <li> of the same level opening
        if ($name == 'li' && count($this->_liStack) &&
            $this->_listScope == $this->_liStack[count($this->_liStack)-1])
        {
            $this->_closeHandler($parser, 'li');
        }

        // LISTS: we want to know on what nesting level of lists we are
        if (in_array($name, $this->listTags)) {
            $this->_listScope++;
        }
        if ($name == 'li') {
            array_push($this->_liStack, $this->_listScope);
        }

        $this->_xhtml .= '<' . $name;
        $this->_writeAttrs($attrs);
        $this->_xhtml .= '>';
        array_push($this->_stack, $name);
        $this->_counter[$name] = isset($this->_counter[$name]) ? $this->_counter[$name]+1 : 1;
        return true;
    }
479
480
    /**
     * Closing tag handler - called from HTMLSax
     *
     * When the tag is listed in $deleteTagsContent and is currently open,
     * $_dcStack is unwound up to and including that tag. While $_dcStack is
     * still non-empty nothing else happens. Otherwise, if the tag is
     * currently open according to $_counter, every tag above it on $_stack
     * is closed first and then the tag itself.
     *
     * @param object $parser HTML parser
     * @param string $name   tag name
     * @return boolean always true
     * @access private
     */
    public function _closeHandler(&$parser, $name)
    {
        $name = strtolower($name);

        $closesDeletedTag = isset($this->_dcCounter[$name])
            && ($this->_dcCounter[$name] > 0)
            && in_array($name, $this->deleteTagsContent);

        if ($closesDeletedTag) {
            // Pop the tags nested inside the deleted tag, then the tag itself.
            for ($tag = array_pop($this->_dcStack); $name != $tag; $tag = array_pop($this->_dcStack)) {
                $this->_dcCounter[$tag]--;
            }
            $this->_dcCounter[$name]--;
        }

        // Still inside a tag whose content is deleted: nothing to write.
        if (count($this->_dcStack) != 0) {
            return true;
        }

        $isOpen = isset($this->_counter[$name]) && ($this->_counter[$name] > 0);
        if ($isOpen) {
            // Close the tags that were opened after $name and never closed.
            for ($tag = array_pop($this->_stack); $name != $tag; $tag = array_pop($this->_stack)) {
                $this->_closeTag($tag);
            }
            $this->_closeTag($name);
        }

        return true;
    }
516
517
    /**
     * Closes tag
     *
     * Writes the closing tag to $_xhtml unless the tag is listed in the
     * deprecated $noClose property, decrements the tag's entry in $_counter,
     * and keeps the list bookkeeping ($_listScope, $_liStack) in step.
     *
     * @param string $tag tag name
     * @return boolean always true
     * @access private
     */
    public function _closeTag($tag)
    {
        $hasClosingTag = !in_array($tag, $this->noClose);
        if ($hasClosingTag) {
            $this->_xhtml .= '</' . $tag . '>';
        }

        $this->_counter[$tag]--;

        // Leaving a list container: one nesting level less.
        if (in_array($tag, $this->listTags)) {
            $this->_listScope--;
        }

        // Leaving a list item: forget the level it was opened on.
        if ('li' == $tag) {
            array_pop($this->_liStack);
        }

        return true;
    }
541
542
    /**
     * Character data handler - called from HTMLSax
     *
     * Appends the data to $_xhtml unchanged, unless a tag whose content is
     * deleted is currently open ($_dcStack is non-empty).
     *
     * @param object $parser HTML parser
     * @param string $data   textual data
     * @return boolean always true
     * @access private
     */
    public function _dataHandler(&$parser, $data)
    {
        if (count($this->_dcStack) != 0) {
            // Inside a deleted tag: discard its text.
            return true;
        }

        $this->_xhtml .= $data;
        return true;
    }
557
558
    /**
     * Escape handler - called from HTMLSax
     *
     * Intentionally writes nothing, so the data passed to it never reaches
     * the output.
     *
     * @param object $parser HTML parser
     * @param string $data   comments or other type of data
     * @return boolean always true
     * @access private
     */
    public function _escapeHandler(&$parser, $data)
    {
        return true;
    }
570
571
    /**
     * Returns the XHTML document
     *
     * Closes every tag that is still on $_stack, innermost first, and then
     * returns the accumulated output.
     *
     * @return string Processed (X)HTML document
     * @access public
     */
    public function getXHTML ()
    {
        // Compare against null instead of relying on truthiness: a tag name
        // such as "0" is falsy and would otherwise end the loop early,
        // leaving the remaining tags unclosed.
        while (null !== ($tag = array_pop($this->_stack))) {
            $this->_closeTag($tag);
        }

        return $this->_xhtml;
    }
585
586
    /**
     * Clears current document data
     *
     * Resets the output buffer and all per-document parsing state. The state
     * must be reset as well: a document that leaves a content-deleting tag
     * (e.g. "script") unclosed leaves $_dcStack non-empty, and without the
     * reset every later document parsed with this instance would be
     * swallowed completely.
     *
     * @return boolean always true
     * @access public
     */
    public function clear()
    {
        $this->_xhtml     = '';
        $this->_counter   = array();
        $this->_stack     = array();
        $this->_dcCounter = array();
        $this->_dcStack   = array();
        $this->_listScope = 0;
        $this->_liStack   = array();
        return true;
    }
597
598
    /**
     * Main parsing function
     *
     * Calls clear(), pre-processes the document, feeds it to a TSax3 parser
     * that reports back to this object's handlers, and returns the result of
     * getXHTML().
     *
     * @param string  $doc    HTML document for processing
     * @param boolean $isUTF7 when true, the document is passed through
     *                        repackUTF7() before parsing
     * @return string Processed (X)HTML document
     * @access public
     */
    public function parse($doc, $isUTF7=false)
    {
        $this->clear();

        // Save all '<' symbols that are not followed by a letter or one of / ! ? %
        $doc = preg_replace("/<(?=[^a-zA-Z\/\!\?\%])/", '&lt;', (string)$doc);

        // Web documents shouldn't contain the \x00 symbol; afterwards apply
        // the Opera6 bug workaround (\xC0\xBC is turned into an escaped '<').
        $doc = str_replace(array("\x00", "\xC0\xBC"), array('', '&lt;'), $doc);

        // UTF-7 encoding ASCII decode
        if ($isUTF7) {
            $doc = $this->repackUTF7($doc);
        }

        // Instantiate the parser and route its events to this object
        $sax = new TSax3();
        $sax->set_object($this);
        $sax->set_element_handler('_openHandler','_closeHandler');
        $sax->set_data_handler('_dataHandler');
        $sax->set_escape_handler('_escapeHandler');

        $sax->parse($doc);

        return $this->getXHTML();
    }
637
638
639
    /**
     * UTF-7 decoding function
     *
     * Hands every "+...-" sequence whose body consists of the characters
     * 0-9, a-z, A-Z and "/" to repackUTF7Callback() and substitutes the
     * returned text.
     *
     * @param string $str HTML document for recode ASCII part of UTF-7 back to ASCII
     * @return string Decoded document
     * @access private
     */
    private function repackUTF7($str)
    {
        $callback = array($this, 'repackUTF7Callback');

        return preg_replace_callback('!\+([0-9a-zA-Z/]+)\-!', $callback, $str);
    }
650
651
    /**
     * Additional UTF-7 decoding function
     *
     * Callback for repackUTF7(). Base64-decodes the captured sequence body,
     * lets repackUTF7Back() re-encode the first run of byte pairs that do
     * not start with a NUL byte (keeping any NUL-prefixed pairs in front of
     * it), and finally drops the NUL byte in front of every NUL-prefixed
     * character.
     *
     * @param array $str match array from preg_replace_callback(); index 1 is
     *                   the sequence body without the surrounding "+" and "-"
     * @return string Recoded string
     * @access private
     */
    private function repackUTF7Callback($str)
    {
        $decoded = base64_decode($str[1]);

        $repacked = preg_replace_callback(
            '/^((?:\x00.)*)((?:[^\x00].)+)/',
            array($this, 'repackUTF7Back'),
            $decoded
        );

        return preg_replace('/\x00(.)/', '$1', $repacked);
    }
664
665
    /**
     * Additional UTF-7 encoding function
     *
     * Callback for repackUTF7Callback(). Returns the first captured group
     * unchanged, followed by the second group base64-encoded (without "="
     * padding) and wrapped in "+" and "-".
     *
     * @param array $str match array from preg_replace_callback(); index 1 is
     *                   kept as is, index 2 is re-encoded
     * @return string Recoded string
     * @access private
     */
    private function repackUTF7Back($str)
    {
        $encoded = rtrim(base64_encode($str[2]), '=');

        return $str[1] . '+' . $encoded . '-';
    }
676
}
677
678
/*
679
 * Local variables:
680
 * tab-width: 4
681
 * c-basic-offset: 4
682
 * c-hanging-comment-ender-p: nil
683
 * End:
684
 */
685
686