Complex classes like TSafeHtmlParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use TSafeHtmlParser and, based on these observations, to apply Extract Interface as well.
| 1 | <?php |
||
| 58 | class TSafeHtmlParser |
||
| 59 | { |
||
| 60 | /** |
||
| 61 | * Storage for resulting HTML output |
||
| 62 | * |
||
| 63 | * @var string |
||
| 64 | * @access private |
||
| 65 | */ |
||
| 66 | private $_xhtml = ''; |
||
| 67 | |||
| 68 | /** |
||
| 69 | * Array of counters for each tag |
||
| 70 | * |
||
| 71 | * @var array |
||
| 72 | * @access private |
||
| 73 | */ |
||
| 74 | private $_counter = array(); |
||
| 75 | |||
| 76 | /** |
||
| 77 | * Stack of unclosed tags |
||
| 78 | * |
||
| 79 | * @var array |
||
| 80 | * @access private |
||
| 81 | */ |
||
| 82 | private $_stack = array(); |
||
| 83 | |||
| 84 | /** |
||
| 85 | * Array of counters for tags that must be deleted with all content |
||
| 86 | * |
||
| 87 | * @var array |
||
| 88 | * @access private |
||
| 89 | */ |
||
| 90 | private $_dcCounter = array(); |
||
| 91 | |||
| 92 | /** |
||
| 93 | * Stack of unclosed tags that must be deleted with all content |
||
| 94 | * |
||
| 95 | * @var array |
||
| 96 | * @access private |
||
| 97 | */ |
||
| 98 | private $_dcStack = array(); |
||
| 99 | |||
| 100 | /** |
||
| 101 | * Stores level of list (ol/ul) nesting |
||
| 102 | * |
||
| 103 | * @var int |
||
| 104 | * @access private |
||
| 105 | */ |
||
| 106 | private $_listScope = 0; |
||
| 107 | |||
| 108 | /** |
||
| 109 | * Stack of unclosed list tags |
||
| 110 | * |
||
| 111 | * @var array |
||
| 112 | * @access private |
||
| 113 | */ |
||
| 114 | private $_liStack = array(); |
||
| 115 | |||
| 116 | /** |
||
| 117 | * Array of prepared regular expressions for protocols (schemas) matching |
||
| 118 | * |
||
| 119 | * @var array |
||
| 120 | * @access private |
||
| 121 | */ |
||
| 122 | private $_protoRegexps = array(); |
||
| 123 | |||
| 124 | /** |
||
| 125 | * Array of prepared regular expressions for CSS matching |
||
| 126 | * |
||
| 127 | * @var array |
||
| 128 | * @access private |
||
| 129 | */ |
||
| 130 | private $_cssRegexps = array(); |
||
| 131 | |||
| 132 | /** |
||
| 133 | * List of single tags ("<tag />") |
||
| 134 | * |
||
| 135 | * @var array |
||
| 136 | * @access public |
||
| 137 | */ |
||
| 138 | public $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', ); |
||
| 139 | |||
| 140 | /** |
||
| 141 | * List of dangerous tags (such tags will be deleted) |
||
| 142 | * |
||
| 143 | * @var array |
||
| 144 | * @access public |
||
| 145 | */ |
||
| 146 | public $deleteTags = array( |
||
| 147 | 'applet', 'base', 'basefont', 'bgsound', 'blink', 'body', |
||
| 148 | 'embed', 'frame', 'frameset', 'head', 'html', 'ilayer', |
||
| 149 | 'iframe', 'layer', 'link', 'meta', 'object', 'style', |
||
| 150 | 'title', 'script', |
||
| 151 | ); |
||
| 152 | |||
| 153 | /** |
||
| 154 | * List of dangerous tags (such tags will be deleted, and all content |
||
| 155 | * inside these tags will also be removed) |
||
| 156 | * |
||
| 157 | * @var array |
||
| 158 | * @access public |
||
| 159 | */ |
||
| 160 | public $deleteTagsContent = array('script', 'style', 'title', 'xml', ); |
||
| 161 | |||
| 162 | /** |
||
| 163 | * Type of protocols filtering ('white' or 'black') |
||
| 164 | * |
||
| 165 | * @var string |
||
| 166 | * @access public |
||
| 167 | */ |
||
| 168 | public $protocolFiltering = 'white'; |
||
| 169 | |||
| 170 | /** |
||
| 171 | * List of "dangerous" protocols (used for blacklist-filtering) |
||
| 172 | * |
||
| 173 | * @var array |
||
| 174 | * @access public |
||
| 175 | */ |
||
| 176 | public $blackProtocols = array( |
||
| 177 | 'about', 'chrome', 'data', 'disk', 'hcp', |
||
| 178 | 'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec', |
||
| 179 | 'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera', |
||
| 180 | 'res', 'resource', 'shell', 'vbscript', 'view-source', |
||
| 181 | 'vnd.ms.radio', 'wysiwyg', |
||
| 182 | ); |
||
| 183 | |||
| 184 | /** |
||
| 185 | * List of "safe" protocols (used for whitelist-filtering) |
||
| 186 | * |
||
| 187 | * @var array |
||
| 188 | * @access public |
||
| 189 | */ |
||
| 190 | public $whiteProtocols = array( |
||
| 191 | 'ed2k', 'file', 'ftp', 'gopher', 'http', 'https', |
||
| 192 | 'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal', |
||
| 193 | 'xmpp', 'callto', |
||
| 194 | ); |
||
| 195 | |||
| 196 | /** |
||
| 197 | * List of attributes that can contain protocols |
||
| 198 | * |
||
| 199 | * @var array |
||
| 200 | * @access public |
||
| 201 | */ |
||
| 202 | public $protocolAttributes = array( |
||
| 203 | 'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src', |
||
| 204 | ); |
||
| 205 | |||
| 206 | /** |
||
| 207 | * List of dangerous CSS keywords |
||
| 208 | * |
||
| 209 | * The whole style="" attribute will be removed if the parser finds one of |
||
| 210 | * these keywords |
||
| 211 | * |
||
| 212 | * @var array |
||
| 213 | * @access public |
||
| 214 | */ |
||
| 215 | public $cssKeywords = array( |
||
| 216 | 'absolute', 'behavior', 'behaviour', 'content', 'expression', |
||
| 217 | 'fixed', 'include-source', 'moz-binding', |
||
| 218 | ); |
||
| 219 | |||
| 220 | /** |
||
| 221 | * List of tags that can have no "closing tag" |
||
| 222 | * |
||
| 223 | * @var array |
||
| 224 | * @access public |
||
| 225 | * @deprecated XHTML does not allow such tags |
||
| 226 | */ |
||
| 227 | public $noClose = array(); |
||
| 228 | |||
| 229 | /** |
||
| 230 | * List of block-level tags that terminate a paragraph |
||
| 231 | * |
||
| 232 | * The paragraph will be closed when one of these tags is opened |
||
| 233 | * |
||
| 234 | * @var array |
||
| 235 | * @access public |
||
| 236 | */ |
||
| 237 | public $closeParagraph = array( |
||
| 238 | 'address', 'blockquote', 'center', 'dd', 'dir', 'div', |
||
| 239 | 'dl', 'dt', 'h1', 'h2', 'h3', 'h4', |
||
| 240 | 'h5', 'h6', 'hr', 'isindex', 'listing', 'marquee', |
||
| 241 | 'menu', 'multicol', 'ol', 'p', 'plaintext', 'pre', |
||
| 242 | 'table', 'ul', 'xmp', |
||
| 243 | ); |
||
| 244 | |||
| 245 | /** |
||
| 246 | * List of table tags, all table tags outside a table will be removed |
||
| 247 | * |
||
| 248 | * @var array |
||
| 249 | * @access public |
||
| 250 | */ |
||
| 251 | public $tableTags = array( |
||
| 252 | 'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', |
||
| 253 | 'thead', 'tr', |
||
| 254 | ); |
||
| 255 | |||
| 256 | /** |
||
| 257 | * List of list tags |
||
| 258 | * |
||
| 259 | * @var array |
||
| 260 | * @access public |
||
| 261 | */ |
||
| 262 | public $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', ); |
||
| 263 | |||
| 264 | /** |
||
| 265 | * List of dangerous attributes |
||
| 266 | * |
||
| 267 | * @var array |
||
| 268 | * @access public |
||
| 269 | */ |
||
| 270 | public $attributes = array('dynsrc'); |
||
| 271 | //public $attributes = array('dynsrc', 'id', 'name', ); //id and name are dangerous? |
||
| 272 | |||
| 273 | /** |
||
| 274 | * List of allowed "namespaced" attributes |
||
| 275 | * |
||
| 276 | * @var array |
||
| 277 | * @access public |
||
| 278 | */ |
||
| 279 | public $attributesNS = array('xml:lang', ); |
||
| 280 | |||
| 281 | /** |
||
| 282 | * Constructs class |
||
| 283 | * |
||
| 284 | * @access public |
||
| 285 | */ |
||
| 286 | public function __construct() |
||
| 303 | |||
| 304 | /** |
||
| 305 | * Handles the writing of attributes - called from $this->_openHandler() |
||
| 306 | * |
||
| 307 | * @param array $attrs array of attributes $name => $value |
||
| 308 | * @return boolean |
||
| 309 | * @access private |
||
| 310 | */ |
||
| 311 | private function _writeAttrs ($attrs) |
||
| 405 | |||
| 406 | /** |
||
| 407 | * Opening tag handler - called from HTMLSax |
||
| 408 | * |
||
| 409 | * @param object $parser HTML Parser |
||
| 410 | * @param string $name tag name |
||
| 411 | * @param array $attrs tag attributes |
||
| 412 | * @return boolean |
||
| 413 | * @access private |
||
| 414 | */ |
||
| 415 | public function _openHandler(&$parser, $name, $attrs) |
||
| 479 | |||
| 480 | /** |
||
| 481 | * Closing tag handler - called from HTMLSax |
||
| 482 | * |
||
| 483 | * @param object $parser HTML parser |
||
| 484 | * @param string $name tag name |
||
| 485 | * @return boolean |
||
| 486 | * @access private |
||
| 487 | */ |
||
| 488 | public function _closeHandler(&$parser, $name) |
||
| 516 | |||
| 517 | /** |
||
| 518 | * Closes tag |
||
| 519 | * |
||
| 520 | * @param string $tag tag name |
||
| 521 | * @return boolean |
||
| 522 | * @access private |
||
| 523 | */ |
||
| 524 | public function _closeTag($tag) |
||
| 541 | |||
| 542 | /** |
||
| 543 | * Character data handler - called from HTMLSax |
||
| 544 | * |
||
| 545 | * @param object $parser HTML parser |
||
| 546 | * @param string $data textual data |
||
| 547 | * @return boolean |
||
| 548 | * @access private |
||
| 549 | */ |
||
| 550 | public function _dataHandler(&$parser, $data) |
||
| 557 | |||
| 558 | /** |
||
| 559 | * Escape handler - called from HTMLSax |
||
| 560 | * |
||
| 561 | * @param object $parser HTML parser |
||
| 562 | * @param string $data comments or other type of data |
||
| 563 | * @return boolean |
||
| 564 | * @access private |
||
| 565 | */ |
||
| 566 | public function _escapeHandler(&$parser, $data) |
||
| 570 | |||
| 571 | /** |
||
| 572 | * Returns the XHTML document |
||
| 573 | * |
||
| 574 | * @return string Processed (X)HTML document |
||
| 575 | * @access public |
||
| 576 | */ |
||
| 577 | public function getXHTML () |
||
| 585 | |||
| 586 | /** |
||
| 587 | * Clears current document data |
||
| 588 | * |
||
| 589 | * @return boolean |
||
| 590 | * @access public |
||
| 591 | */ |
||
| 592 | public function clear() |
||
| 597 | |||
| 598 | /** |
||
| 599 | * Main parsing function |
||
| 600 | * |
||
| 601 | * @param string $doc HTML document for processing |
||
| 602 | * @return string Processed (X)HTML document |
||
| 603 | * @access public |
||
| 604 | */ |
||
| 605 | public function parse($doc, $isUTF7=false) |
||
| 637 | |||
| 638 | |||
| 639 | /** |
||
| 640 | * UTF-7 decoding function |
||
| 641 | * |
||
| 642 | * @param string $str HTML document in which the ASCII part of UTF-7 is recoded back to ASCII |
||
| 643 | * @return string Decoded document |
||
| 644 | * @access private |
||
| 645 | */ |
||
| 646 | private function repackUTF7($str) |
||
| 650 | |||
| 651 | /** |
||
| 652 | * Additional UTF-7 decoding function |
||
| 653 | * |
||
| 654 | * @param string $str String in which the ASCII part of UTF-7 is recoded back to ASCII |
||
| 655 | * @return string Recoded string |
||
| 656 | * @access private |
||
| 657 | */ |
||
| 658 | private function repackUTF7Callback($str) |
||
| 664 | |||
| 665 | /** |
||
| 666 | * Additional UTF-7 encoding function |
||
| 667 | * |
||
| 668 | * @param string $str String for recode ASCII part of UTF-7 back to ASCII |
||
| 669 | * @return string Recoded string |
||
| 670 | * @access private |
||
| 671 | */ |
||
| 672 | private function repackUTF7Back($str) |
||
| 676 | } |
||
| 677 | |||
| 686 |