Complex classes like TSafeHtmlParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use TSafeHtmlParser, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
56 | class TSafeHtmlParser |
||
57 | { |
||
58 | /** |
||
59 | * Storage for resulting HTML output |
||
60 | * |
||
61 | * @var string |
||
62 | * @access private |
||
63 | */ |
||
64 | private $_xhtml = ''; |
||
65 | |||
66 | /** |
||
67 | * Array of counters for each tag |
||
68 | * |
||
69 | * @var array |
||
70 | * @access private |
||
71 | */ |
||
72 | private $_counter = array(); |
||
73 | |||
74 | /** |
||
75 | * Stack of unclosed tags |
||
76 | * |
||
77 | * @var array |
||
78 | * @access private |
||
79 | */ |
||
80 | private $_stack = array(); |
||
81 | |||
82 | /** |
||
83 | * Array of counters for tags that must be deleted with all content |
||
84 | * |
||
85 | * @var array |
||
86 | * @access private |
||
87 | */ |
||
88 | private $_dcCounter = array(); |
||
89 | |||
90 | /** |
||
91 | * Stack of unclosed tags that must be deleted with all content |
||
92 | * |
||
93 | * @var array |
||
94 | * @access private |
||
95 | */ |
||
96 | private $_dcStack = array(); |
||
97 | |||
98 | /** |
||
99 | * Stores level of list (ol/ul) nesting |
||
100 | * |
||
101 | * @var int |
||
102 | * @access private |
||
103 | */ |
||
104 | private $_listScope = 0; |
||
105 | |||
106 | /** |
||
107 | * Stack of unclosed list tags |
||
108 | * |
||
109 | * @var array |
||
110 | * @access private |
||
111 | */ |
||
112 | private $_liStack = array(); |
||
113 | |||
114 | /** |
||
115 | * Array of prepared regular expressions for protocols (schemas) matching |
||
116 | * |
||
117 | * @var array |
||
118 | * @access private |
||
119 | */ |
||
120 | private $_protoRegexps = array(); |
||
121 | |||
122 | /** |
||
123 | * Array of prepared regular expressions for CSS matching |
||
124 | * |
||
125 | * @var array |
||
126 | * @access private |
||
127 | */ |
||
128 | private $_cssRegexps = array(); |
||
129 | |||
130 | /** |
||
131 | * List of single tags ("<tag />") |
||
132 | * |
||
133 | * @var array |
||
134 | * @access public |
||
135 | */ |
||
136 | public $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', ); |
||
137 | |||
138 | /** |
||
139 | * List of dangerous tags (such tags will be deleted) |
||
140 | * |
||
141 | * @var array |
||
142 | * @access public |
||
143 | */ |
||
144 | public $deleteTags = array( |
||
145 | 'applet', 'base', 'basefont', 'bgsound', 'blink', 'body', |
||
146 | 'embed', 'frame', 'frameset', 'head', 'html', 'ilayer', |
||
147 | 'iframe', 'layer', 'link', 'meta', 'object', 'style', |
||
148 | 'title', 'script', |
||
149 | ); |
||
150 | |||
151 | /** |
||
152 | * List of dangerous tags (such tags will be deleted, and all content |
||
153 | * inside these tags will also be removed) |
||
154 | * |
||
155 | * @var array |
||
156 | * @access public |
||
157 | */ |
||
158 | public $deleteTagsContent = array('script', 'style', 'title', 'xml', ); |
||
159 | |||
160 | /** |
||
161 | * Type of protocols filtering ('white' or 'black') |
||
162 | * |
||
163 | * @var string |
||
164 | * @access public |
||
165 | */ |
||
166 | public $protocolFiltering = 'white'; |
||
167 | |||
168 | /** |
||
169 | * List of "dangerous" protocols (used for blacklist-filtering) |
||
170 | * |
||
171 | * @var array |
||
172 | * @access public |
||
173 | */ |
||
174 | public $blackProtocols = array( |
||
175 | 'about', 'chrome', 'data', 'disk', 'hcp', |
||
176 | 'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec', |
||
177 | 'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera', |
||
178 | 'res', 'resource', 'shell', 'vbscript', 'view-source', |
||
179 | 'vnd.ms.radio', 'wysiwyg', |
||
180 | ); |
||
181 | |||
182 | /** |
||
183 | * List of "safe" protocols (used for whitelist-filtering) |
||
184 | * |
||
185 | * @var array |
||
186 | * @access public |
||
187 | */ |
||
188 | public $whiteProtocols = array( |
||
189 | 'ed2k', 'file', 'ftp', 'gopher', 'http', 'https', |
||
190 | 'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal', |
||
191 | 'xmpp', 'callto', |
||
192 | ); |
||
193 | |||
194 | /** |
||
195 | * List of attributes that can contain protocols |
||
196 | * |
||
197 | * @var array |
||
198 | * @access public |
||
199 | */ |
||
200 | public $protocolAttributes = array( |
||
201 | 'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src', |
||
202 | ); |
||
203 | |||
204 | /** |
||
205 | * List of dangerous CSS keywords |
||
206 | * |
||
207 | * The whole style="" attribute will be removed if the parser finds one of |
||
208 | * these keywords |
||
209 | * |
||
210 | * @var array |
||
211 | * @access public |
||
212 | */ |
||
213 | public $cssKeywords = array( |
||
214 | 'absolute', 'behavior', 'behaviour', 'content', 'expression', |
||
215 | 'fixed', 'include-source', 'moz-binding', |
||
216 | ); |
||
217 | |||
218 | /** |
||
219 | * List of tags that can have no "closing tag" |
||
220 | * |
||
221 | * @var array |
||
222 | * @access public |
||
223 | * @deprecated XHTML does not allow such tags |
||
224 | */ |
||
225 | public $noClose = array(); |
||
226 | |||
227 | /** |
||
228 | * List of block-level tags that terminate a paragraph |
||
229 | * |
||
230 | * The paragraph will be closed when one of these tags is opened |
||
231 | * |
||
232 | * @var array |
||
233 | * @access public |
||
234 | */ |
||
235 | public $closeParagraph = array( |
||
236 | 'address', 'blockquote', 'center', 'dd', 'dir', 'div', |
||
237 | 'dl', 'dt', 'h1', 'h2', 'h3', 'h4', |
||
238 | 'h5', 'h6', 'hr', 'isindex', 'listing', 'marquee', |
||
239 | 'menu', 'multicol', 'ol', 'p', 'plaintext', 'pre', |
||
240 | 'table', 'ul', 'xmp', |
||
241 | ); |
||
242 | |||
243 | /** |
||
244 | * List of table tags, all table tags outside a table will be removed |
||
245 | * |
||
246 | * @var array |
||
247 | * @access public |
||
248 | */ |
||
249 | public $tableTags = array( |
||
250 | 'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', |
||
251 | 'thead', 'tr', |
||
252 | ); |
||
253 | |||
254 | /** |
||
255 | * List of list tags |
||
256 | * |
||
257 | * @var array |
||
258 | * @access public |
||
259 | */ |
||
260 | public $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', ); |
||
261 | |||
262 | /** |
||
263 | * List of dangerous attributes |
||
264 | * |
||
265 | * @var array |
||
266 | * @access public |
||
267 | */ |
||
268 | public $attributes = array('dynsrc'); |
||
269 | //public $attributes = array('dynsrc', 'id', 'name', ); //id and name are dangerous? |
||
270 | |||
271 | /** |
||
272 | * List of allowed "namespaced" attributes |
||
273 | * |
||
274 | * @var array |
||
275 | * @access public |
||
276 | */ |
||
277 | public $attributesNS = array('xml:lang', ); |
||
278 | |||
279 | /** |
||
280 | * Constructs class |
||
281 | * |
||
282 | * @access public |
||
283 | */ |
||
284 | public function __construct() |
||
301 | |||
302 | /** |
||
303 | * Handles the writing of attributes - called from $this->_openHandler() |
||
304 | * |
||
305 | * @param array $attrs array of attributes $name => $value |
||
306 | * @return boolean |
||
307 | * @access private |
||
308 | */ |
||
309 | private function _writeAttrs ($attrs) |
||
403 | |||
404 | /** |
||
405 | * Opening tag handler - called from HTMLSax |
||
406 | * |
||
407 | * @param object $parser HTML Parser |
||
408 | * @param string $name tag name |
||
409 | * @param array $attrs tag attributes |
||
410 | * @return boolean |
||
411 | * @access private |
||
412 | */ |
||
413 | public function _openHandler(&$parser, $name, $attrs) |
||
477 | |||
478 | /** |
||
479 | * Closing tag handler - called from HTMLSax |
||
480 | * |
||
481 | * @param object $parser HTML parser |
||
482 | * @param string $name tag name |
||
483 | * @return boolean |
||
484 | * @access private |
||
485 | */ |
||
486 | public function _closeHandler(&$parser, $name) |
||
514 | |||
515 | /** |
||
516 | * Closes tag |
||
517 | * |
||
518 | * @param string $tag tag name |
||
519 | * @return boolean |
||
520 | * @access private |
||
521 | */ |
||
522 | public function _closeTag($tag) |
||
539 | |||
540 | /** |
||
541 | * Character data handler - called from HTMLSax |
||
542 | * |
||
543 | * @param object $parser HTML parser |
||
544 | * @param string $data textual data |
||
545 | * @return boolean |
||
546 | * @access private |
||
547 | */ |
||
548 | public function _dataHandler(&$parser, $data) |
||
555 | |||
556 | /** |
||
557 | * Escape handler - called from HTMLSax |
||
558 | * |
||
559 | * @param object $parser HTML parser |
||
560 | * @param string $data comments or other type of data |
||
561 | * @return boolean |
||
562 | * @access private |
||
563 | */ |
||
564 | public function _escapeHandler(&$parser, $data) |
||
568 | |||
569 | /** |
||
570 | * Returns the XHTML document |
||
571 | * |
||
572 | * @return string Processed (X)HTML document |
||
573 | * @access public |
||
574 | */ |
||
575 | public function getXHTML () |
||
583 | |||
584 | /** |
||
585 | * Clears current document data |
||
586 | * |
||
587 | * @return boolean |
||
588 | * @access public |
||
589 | */ |
||
590 | public function clear() |
||
595 | |||
596 | /** |
||
597 | * Main parsing function |
||
598 | * |
||
599 | * @param string $doc HTML document for processing |
||
600 | * @return string Processed (X)HTML document |
||
601 | * @access public |
||
602 | */ |
||
603 | public function parse($doc, $isUTF7=false) |
||
635 | |||
636 | |||
637 | /** |
||
638 | * UTF-7 decoding function |
||
639 | * |
||
640 | * @param string $str HTML document whose ASCII part of UTF-7 is recoded back to ASCII |
||
641 | * @return string Decoded document |
||
642 | * @access private |
||
643 | */ |
||
644 | private function repackUTF7($str) |
||
648 | |||
649 | /** |
||
650 | * Additional UTF-7 decoding function |
||
651 | * |
||
652 | * @param string $str String whose ASCII part of UTF-7 is recoded back to ASCII |
||
653 | * @return string Recoded string |
||
654 | * @access private |
||
655 | */ |
||
656 | private function repackUTF7Callback($str) |
||
662 | |||
663 | /** |
||
664 | * Additional UTF-7 encoding function |
||
665 | * |
||
666 | * @param string $str String whose ASCII part of UTF-7 is recoded back to ASCII |
||
667 | * @return string Recoded string |
||
668 | * @access private |
||
669 | */ |
||
670 | private function repackUTF7Back($str) |
||
674 | } |
||
675 | |||
684 |