Complex classes like TSafeHtmlParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use TSafeHtmlParser, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 56 | class TSafeHtmlParser |
||
| 57 | { |
||
| 58 | /** |
||
| 59 | * Storage for resulting HTML output |
||
| 60 | * |
||
| 61 | * @var string |
||
| 62 | * @access private |
||
| 63 | */ |
||
| 64 | private $_xhtml = ''; |
||
| 65 | |||
| 66 | /** |
||
| 67 | * Array of counters for each tag |
||
| 68 | * |
||
| 69 | * @var array |
||
| 70 | * @access private |
||
| 71 | */ |
||
| 72 | private $_counter = array(); |
||
| 73 | |||
| 74 | /** |
||
| 75 | * Stack of unclosed tags |
||
| 76 | * |
||
| 77 | * @var array |
||
| 78 | * @access private |
||
| 79 | */ |
||
| 80 | private $_stack = array(); |
||
| 81 | |||
| 82 | /** |
||
| 83 | * Array of counters for tags that must be deleted with all content |
||
| 84 | * |
||
| 85 | * @var array |
||
| 86 | * @access private |
||
| 87 | */ |
||
| 88 | private $_dcCounter = array(); |
||
| 89 | |||
| 90 | /** |
||
| 91 | * Stack of unclosed tags that must be deleted with all content |
||
| 92 | * |
||
| 93 | * @var array |
||
| 94 | * @access private |
||
| 95 | */ |
||
| 96 | private $_dcStack = array(); |
||
| 97 | |||
| 98 | /** |
||
| 99 | * Stores level of list (ol/ul) nesting |
||
| 100 | * |
||
| 101 | * @var int |
||
| 102 | * @access private |
||
| 103 | */ |
||
| 104 | private $_listScope = 0; |
||
| 105 | |||
| 106 | /** |
||
| 107 | * Stack of unclosed list tags |
||
| 108 | * |
||
| 109 | * @var array |
||
| 110 | * @access private |
||
| 111 | */ |
||
| 112 | private $_liStack = array(); |
||
| 113 | |||
| 114 | /** |
||
| 115 | * Array of prepared regular expressions for protocols (schemas) matching |
||
| 116 | * |
||
| 117 | * @var array |
||
| 118 | * @access private |
||
| 119 | */ |
||
| 120 | private $_protoRegexps = array(); |
||
| 121 | |||
| 122 | /** |
||
| 123 | * Array of prepared regular expressions for CSS matching |
||
| 124 | * |
||
| 125 | * @var array |
||
| 126 | * @access private |
||
| 127 | */ |
||
| 128 | private $_cssRegexps = array(); |
||
| 129 | |||
| 130 | /** |
||
| 131 | * List of single tags ("<tag />") |
||
| 132 | * |
||
| 133 | * @var array |
||
| 134 | * @access public |
||
| 135 | */ |
||
| 136 | public $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', ); |
||
| 137 | |||
| 138 | /** |
||
| 139 | * List of dangerous tags (such tags will be deleted) |
||
| 140 | * |
||
| 141 | * @var array |
||
| 142 | * @access public |
||
| 143 | */ |
||
| 144 | public $deleteTags = array( |
||
| 145 | 'applet', 'base', 'basefont', 'bgsound', 'blink', 'body', |
||
| 146 | 'embed', 'frame', 'frameset', 'head', 'html', 'ilayer', |
||
| 147 | 'iframe', 'layer', 'link', 'meta', 'object', 'style', |
||
| 148 | 'title', 'script', |
||
| 149 | ); |
||
| 150 | |||
| 151 | /** |
||
| 152 | * List of dangerous tags (such tags will be deleted, and all content |
||
| 153 | * inside these tags will also be removed) |
||
| 154 | * |
||
| 155 | * @var array |
||
| 156 | * @access public |
||
| 157 | */ |
||
| 158 | public $deleteTagsContent = array('script', 'style', 'title', 'xml', ); |
||
| 159 | |||
| 160 | /** |
||
| 161 | * Type of protocols filtering ('white' or 'black') |
||
| 162 | * |
||
| 163 | * @var string |
||
| 164 | * @access public |
||
| 165 | */ |
||
| 166 | public $protocolFiltering = 'white'; |
||
| 167 | |||
| 168 | /** |
||
| 169 | * List of "dangerous" protocols (used for blacklist-filtering) |
||
| 170 | * |
||
| 171 | * @var array |
||
| 172 | * @access public |
||
| 173 | */ |
||
| 174 | public $blackProtocols = array( |
||
| 175 | 'about', 'chrome', 'data', 'disk', 'hcp', |
||
| 176 | 'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec', |
||
| 177 | 'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera', |
||
| 178 | 'res', 'resource', 'shell', 'vbscript', 'view-source', |
||
| 179 | 'vnd.ms.radio', 'wysiwyg', |
||
| 180 | ); |
||
| 181 | |||
| 182 | /** |
||
| 183 | * List of "safe" protocols (used for whitelist-filtering) |
||
| 184 | * |
||
| 185 | * @var array |
||
| 186 | * @access public |
||
| 187 | */ |
||
| 188 | public $whiteProtocols = array( |
||
| 189 | 'ed2k', 'file', 'ftp', 'gopher', 'http', 'https', |
||
| 190 | 'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal', |
||
| 191 | 'xmpp', 'callto', |
||
| 192 | ); |
||
| 193 | |||
| 194 | /** |
||
| 195 | * List of attributes that can contain protocols |
||
| 196 | * |
||
| 197 | * @var array |
||
| 198 | * @access public |
||
| 199 | */ |
||
| 200 | public $protocolAttributes = array( |
||
| 201 | 'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src', |
||
| 202 | ); |
||
| 203 | |||
| 204 | /** |
||
| 205 | * List of dangerous CSS keywords |
||
| 206 | * |
||
| 207 | * The whole style="" attribute is removed if the parser finds one of |
||
| 208 | * these keywords |
||
| 209 | * |
||
| 210 | * @var array |
||
| 211 | * @access public |
||
| 212 | */ |
||
| 213 | public $cssKeywords = array( |
||
| 214 | 'absolute', 'behavior', 'behaviour', 'content', 'expression', |
||
| 215 | 'fixed', 'include-source', 'moz-binding', |
||
| 216 | ); |
||
| 217 | |||
| 218 | /** |
||
| 219 | * List of tags that can have no "closing tag" |
||
| 220 | * |
||
| 221 | * @var array |
||
| 222 | * @access public |
||
| 223 | * @deprecated XHTML does not allow such tags |
||
| 224 | */ |
||
| 225 | public $noClose = array(); |
||
| 226 | |||
| 227 | /** |
||
| 228 | * List of block-level tags that terminate a paragraph |
||
| 229 | * |
||
| 230 | * The paragraph is closed when one of these tags is opened |
||
| 231 | * |
||
| 232 | * @var array |
||
| 233 | * @access public |
||
| 234 | */ |
||
| 235 | public $closeParagraph = array( |
||
| 236 | 'address', 'blockquote', 'center', 'dd', 'dir', 'div', |
||
| 237 | 'dl', 'dt', 'h1', 'h2', 'h3', 'h4', |
||
| 238 | 'h5', 'h6', 'hr', 'isindex', 'listing', 'marquee', |
||
| 239 | 'menu', 'multicol', 'ol', 'p', 'plaintext', 'pre', |
||
| 240 | 'table', 'ul', 'xmp', |
||
| 241 | ); |
||
| 242 | |||
| 243 | /** |
||
| 244 | * List of table tags, all table tags outside a table will be removed |
||
| 245 | * |
||
| 246 | * @var array |
||
| 247 | * @access public |
||
| 248 | */ |
||
| 249 | public $tableTags = array( |
||
| 250 | 'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', |
||
| 251 | 'thead', 'tr', |
||
| 252 | ); |
||
| 253 | |||
| 254 | /** |
||
| 255 | * List of list tags |
||
| 256 | * |
||
| 257 | * @var array |
||
| 258 | * @access public |
||
| 259 | */ |
||
| 260 | public $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', ); |
||
| 261 | |||
| 262 | /** |
||
| 263 | * List of dangerous attributes |
||
| 264 | * |
||
| 265 | * @var array |
||
| 266 | * @access public |
||
| 267 | */ |
||
| 268 | public $attributes = array('dynsrc'); |
||
| 269 | //public $attributes = array('dynsrc', 'id', 'name', ); //id and name are dangerous? |
||
| 270 | |||
| 271 | /** |
||
| 272 | * List of allowed "namespaced" attributes |
||
| 273 | * |
||
| 274 | * @var array |
||
| 275 | * @access public |
||
| 276 | */ |
||
| 277 | public $attributesNS = array('xml:lang', ); |
||
| 278 | |||
| 279 | /** |
||
| 280 | * Constructs class |
||
| 281 | * |
||
| 282 | * @access public |
||
| 283 | */ |
||
| 284 | public function __construct() |
||
| 301 | |||
| 302 | /** |
||
| 303 | * Handles the writing of attributes - called from $this->_openHandler() |
||
| 304 | * |
||
| 305 | * @param array $attrs array of attributes $name => $value |
||
| 306 | * @return boolean |
||
| 307 | * @access private |
||
| 308 | */ |
||
| 309 | private function _writeAttrs ($attrs) |
||
| 403 | |||
| 404 | /** |
||
| 405 | * Opening tag handler - called from HTMLSax |
||
| 406 | * |
||
| 407 | * @param object $parser HTML Parser |
||
| 408 | * @param string $name tag name |
||
| 409 | * @param array $attrs tag attributes |
||
| 410 | * @return boolean |
||
| 411 | * @access private |
||
| 412 | */ |
||
| 413 | public function _openHandler(&$parser, $name, $attrs) |
||
| 477 | |||
| 478 | /** |
||
| 479 | * Closing tag handler - called from HTMLSax |
||
| 480 | * |
||
| 481 | * @param object $parser HTML parser |
||
| 482 | * @param string $name tag name |
||
| 483 | * @return boolean |
||
| 484 | * @access private |
||
| 485 | */ |
||
| 486 | public function _closeHandler(&$parser, $name) |
||
| 514 | |||
| 515 | /** |
||
| 516 | * Closes tag |
||
| 517 | * |
||
| 518 | * @param string $tag tag name |
||
| 519 | * @return boolean |
||
| 520 | * @access private |
||
| 521 | */ |
||
| 522 | public function _closeTag($tag) |
||
| 539 | |||
| 540 | /** |
||
| 541 | * Character data handler - called from HTMLSax |
||
| 542 | * |
||
| 543 | * @param object $parser HTML parser |
||
| 544 | * @param string $data textual data |
||
| 545 | * @return boolean |
||
| 546 | * @access private |
||
| 547 | */ |
||
| 548 | public function _dataHandler(&$parser, $data) |
||
| 555 | |||
| 556 | /** |
||
| 557 | * Escape handler - called from HTMLSax |
||
| 558 | * |
||
| 559 | * @param object $parser HTML parser |
||
| 560 | * @param string $data comments or other type of data |
||
| 561 | * @return boolean |
||
| 562 | * @access private |
||
| 563 | */ |
||
| 564 | public function _escapeHandler(&$parser, $data) |
||
| 568 | |||
| 569 | /** |
||
| 570 | * Returns the XHTML document |
||
| 571 | * |
||
| 572 | * @return string Processed (X)HTML document |
||
| 573 | * @access public |
||
| 574 | */ |
||
| 575 | public function getXHTML () |
||
| 583 | |||
| 584 | /** |
||
| 585 | * Clears current document data |
||
| 586 | * |
||
| 587 | * @return boolean |
||
| 588 | * @access public |
||
| 589 | */ |
||
| 590 | public function clear() |
||
| 595 | |||
| 596 | /** |
||
| 597 | * Main parsing function |
||
| 598 | * |
||
| 599 | * @param string $doc HTML document for processing |
||
| 600 | * @return string Processed (X)HTML document |
||
| 601 | * @access public |
||
| 602 | */ |
||
| 603 | public function parse($doc, $isUTF7=false) |
||
| 635 | |||
| 636 | |||
| 637 | /** |
||
| 638 | * UTF-7 decoding function |
||
| 639 | * |
||
| 640 | * @param string $str HTML document in which to recode the ASCII part of UTF-7 back to ASCII |
||
| 641 | * @return string Decoded document |
||
| 642 | * @access private |
||
| 643 | */ |
||
| 644 | private function repackUTF7($str) |
||
| 648 | |||
| 649 | /** |
||
| 650 | * Additional UTF-7 decoding function |
||
| 651 | * |
||
| 652 | * @param string $str String in which to recode the ASCII part of UTF-7 back to ASCII |
||
| 653 | * @return string Recoded string |
||
| 654 | * @access private |
||
| 655 | */ |
||
| 656 | private function repackUTF7Callback($str) |
||
| 662 | |||
| 663 | /** |
||
| 664 | * Additional UTF-7 encoding function |
||
| 665 | * |
||
| 666 | * @param string $str String in which to recode the ASCII part of UTF-7 back to ASCII |
||
| 667 | * @return string Recoded string |
||
| 668 | * @access private |
||
| 669 | */ |
||
| 670 | private function repackUTF7Back($str) |
||
| 674 | } |
||
| 675 | |||
| 684 |