Complex classes like TSafeHtmlParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use TSafeHtmlParser, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
58 | class TSafeHtmlParser |
||
59 | { |
||
60 | /** |
||
61 | * Storage for resulting HTML output |
||
62 | * |
||
63 | * @var string |
||
64 | * @access private |
||
65 | */ |
||
66 | private $_xhtml = ''; |
||
67 | |||
68 | /** |
||
69 | * Array of counters for each tag |
||
70 | * |
||
71 | * @var array |
||
72 | * @access private |
||
73 | */ |
||
74 | private $_counter = array(); |
||
75 | |||
76 | /** |
||
77 | * Stack of unclosed tags |
||
78 | * |
||
79 | * @var array |
||
80 | * @access private |
||
81 | */ |
||
82 | private $_stack = array(); |
||
83 | |||
84 | /** |
||
85 | * Array of counters for tags that must be deleted with all content |
||
86 | * |
||
87 | * @var array |
||
88 | * @access private |
||
89 | */ |
||
90 | private $_dcCounter = array(); |
||
91 | |||
92 | /** |
||
93 | * Stack of unclosed tags that must be deleted with all content |
||
94 | * |
||
95 | * @var array |
||
96 | * @access private |
||
97 | */ |
||
98 | private $_dcStack = array(); |
||
99 | |||
100 | /** |
||
101 | * Stores level of list (ol/ul) nesting |
||
102 | * |
||
103 | * @var int |
||
104 | * @access private |
||
105 | */ |
||
106 | private $_listScope = 0; |
||
107 | |||
108 | /** |
||
109 | * Stack of unclosed list tags |
||
110 | * |
||
111 | * @var array |
||
112 | * @access private |
||
113 | */ |
||
114 | private $_liStack = array(); |
||
115 | |||
116 | /** |
||
117 | * Array of prepared regular expressions for protocols (schemas) matching |
||
118 | * |
||
119 | * @var array |
||
120 | * @access private |
||
121 | */ |
||
122 | private $_protoRegexps = array(); |
||
123 | |||
124 | /** |
||
125 | * Array of prepared regular expressions for CSS matching |
||
126 | * |
||
127 | * @var array |
||
128 | * @access private |
||
129 | */ |
||
130 | private $_cssRegexps = array(); |
||
131 | |||
132 | /** |
||
133 | * List of single tags ("<tag />") |
||
134 | * |
||
135 | * @var array |
||
136 | * @access public |
||
137 | */ |
||
138 | public $singleTags = array('area', 'br', 'img', 'input', 'hr', 'wbr', ); |
||
139 | |||
140 | /** |
||
141 | * List of dangerous tags (such tags will be deleted) |
||
142 | * |
||
143 | * @var array |
||
144 | * @access public |
||
145 | */ |
||
146 | public $deleteTags = array( |
||
147 | 'applet', 'base', 'basefont', 'bgsound', 'blink', 'body', |
||
148 | 'embed', 'frame', 'frameset', 'head', 'html', 'ilayer', |
||
149 | 'iframe', 'layer', 'link', 'meta', 'object', 'style', |
||
150 | 'title', 'script', |
||
151 | ); |
||
152 | |||
153 | /** |
||
154 | * List of dangerous tags (such tags will be deleted, and all content |
||
155 | * inside these tags will also be removed) |
||
156 | * |
||
157 | * @var array |
||
158 | * @access public |
||
159 | */ |
||
160 | public $deleteTagsContent = array('script', 'style', 'title', 'xml', ); |
||
161 | |||
162 | /** |
||
163 | * Type of protocols filtering ('white' or 'black') |
||
164 | * |
||
165 | * @var string |
||
166 | * @access public |
||
167 | */ |
||
168 | public $protocolFiltering = 'white'; |
||
169 | |||
170 | /** |
||
171 | * List of "dangerous" protocols (used for blacklist-filtering) |
||
172 | * |
||
173 | * @var array |
||
174 | * @access public |
||
175 | */ |
||
176 | public $blackProtocols = array( |
||
177 | 'about', 'chrome', 'data', 'disk', 'hcp', |
||
178 | 'help', 'javascript', 'livescript', 'lynxcgi', 'lynxexec', |
||
179 | 'ms-help', 'ms-its', 'mhtml', 'mocha', 'opera', |
||
180 | 'res', 'resource', 'shell', 'vbscript', 'view-source', |
||
181 | 'vnd.ms.radio', 'wysiwyg', |
||
182 | ); |
||
183 | |||
184 | /** |
||
185 | * List of "safe" protocols (used for whitelist-filtering) |
||
186 | * |
||
187 | * @var array |
||
188 | * @access public |
||
189 | */ |
||
190 | public $whiteProtocols = array( |
||
191 | 'ed2k', 'file', 'ftp', 'gopher', 'http', 'https', |
||
192 | 'irc', 'mailto', 'news', 'nntp', 'telnet', 'webcal', |
||
193 | 'xmpp', 'callto', |
||
194 | ); |
||
195 | |||
196 | /** |
||
197 | * List of attributes that can contain protocols |
||
198 | * |
||
199 | * @var array |
||
200 | * @access public |
||
201 | */ |
||
202 | public $protocolAttributes = array( |
||
203 | 'action', 'background', 'codebase', 'dynsrc', 'href', 'lowsrc', 'src', |
||
204 | ); |
||
205 | |||
206 | /** |
||
207 | * List of dangerous CSS keywords |
||
208 | * |
||
209 | * The whole style="" attribute will be removed if the parser finds one of |
||
210 | * these keywords |
||
211 | * |
||
212 | * @var array |
||
213 | * @access public |
||
214 | */ |
||
215 | public $cssKeywords = array( |
||
216 | 'absolute', 'behavior', 'behaviour', 'content', 'expression', |
||
217 | 'fixed', 'include-source', 'moz-binding', |
||
218 | ); |
||
219 | |||
220 | /** |
||
221 | * List of tags that can have no "closing tag" |
||
222 | * |
||
223 | * @var array |
||
224 | * @access public |
||
225 | * @deprecated XHTML does not allow such tags |
||
226 | */ |
||
227 | public $noClose = array(); |
||
228 | |||
229 | /** |
||
230 | * List of block-level tags that terminate a paragraph |
||
231 | * |
||
232 | * The paragraph will be closed when one of these tags is opened |
||
233 | * |
||
234 | * @var array |
||
235 | * @access public |
||
236 | */ |
||
237 | public $closeParagraph = array( |
||
238 | 'address', 'blockquote', 'center', 'dd', 'dir', 'div', |
||
239 | 'dl', 'dt', 'h1', 'h2', 'h3', 'h4', |
||
240 | 'h5', 'h6', 'hr', 'isindex', 'listing', 'marquee', |
||
241 | 'menu', 'multicol', 'ol', 'p', 'plaintext', 'pre', |
||
242 | 'table', 'ul', 'xmp', |
||
243 | ); |
||
244 | |||
245 | /** |
||
246 | * List of table tags, all table tags outside a table will be removed |
||
247 | * |
||
248 | * @var array |
||
249 | * @access public |
||
250 | */ |
||
251 | public $tableTags = array( |
||
252 | 'caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', |
||
253 | 'thead', 'tr', |
||
254 | ); |
||
255 | |||
256 | /** |
||
257 | * List of list tags |
||
258 | * |
||
259 | * @var array |
||
260 | * @access public |
||
261 | */ |
||
262 | public $listTags = array('dir', 'menu', 'ol', 'ul', 'dl', ); |
||
263 | |||
264 | /** |
||
265 | * List of dangerous attributes |
||
266 | * |
||
267 | * @var array |
||
268 | * @access public |
||
269 | */ |
||
270 | public $attributes = array('dynsrc'); |
||
271 | //public $attributes = array('dynsrc', 'id', 'name', ); //id and name are dangerous? |
||
272 | |||
273 | /** |
||
274 | * List of allowed "namespaced" attributes |
||
275 | * |
||
276 | * @var array |
||
277 | * @access public |
||
278 | */ |
||
279 | public $attributesNS = array('xml:lang', ); |
||
280 | |||
281 | /** |
||
282 | * Constructs class |
||
283 | * |
||
284 | * @access public |
||
285 | */ |
||
286 | public function __construct() |
||
303 | |||
304 | /** |
||
305 | * Handles the writing of attributes - called from $this->_openHandler() |
||
306 | * |
||
307 | * @param array $attrs array of attributes $name => $value |
||
308 | * @return boolean |
||
309 | * @access private |
||
310 | */ |
||
311 | private function _writeAttrs ($attrs) |
||
405 | |||
406 | /** |
||
407 | * Opening tag handler - called from HTMLSax |
||
408 | * |
||
409 | * @param object $parser HTML Parser |
||
410 | * @param string $name tag name |
||
411 | * @param array $attrs tag attributes |
||
412 | * @return boolean |
||
413 | * @access private |
||
414 | */ |
||
415 | public function _openHandler(&$parser, $name, $attrs) |
||
479 | |||
480 | /** |
||
481 | * Closing tag handler - called from HTMLSax |
||
482 | * |
||
483 | * @param object $parsers HTML parser |
||
484 | * @param string $name tag name |
||
485 | * @return boolean |
||
486 | * @access private |
||
487 | */ |
||
488 | public function _closeHandler(&$parser, $name) |
||
516 | |||
517 | /** |
||
518 | * Closes tag |
||
519 | * |
||
520 | * @param string $tag tag name |
||
521 | * @return boolean |
||
522 | * @access private |
||
523 | */ |
||
524 | public function _closeTag($tag) |
||
541 | |||
542 | /** |
||
543 | * Character data handler - called from HTMLSax |
||
544 | * |
||
545 | * @param object $parser HTML parser |
||
546 | * @param string $data textual data |
||
547 | * @return boolean |
||
548 | * @access private |
||
549 | */ |
||
550 | public function _dataHandler(&$parser, $data) |
||
557 | |||
558 | /** |
||
559 | * Escape handler - called from HTMLSax |
||
560 | * |
||
561 | * @param object $parser HTML parser |
||
562 | * @param string $data comments or other type of data |
||
563 | * @return boolean |
||
564 | * @access private |
||
565 | */ |
||
566 | public function _escapeHandler(&$parser, $data) |
||
570 | |||
571 | /** |
||
572 | * Returns the XHTML document |
||
573 | * |
||
574 | * @return string Processed (X)HTML document |
||
575 | * @access public |
||
576 | */ |
||
577 | public function getXHTML () |
||
585 | |||
586 | /** |
||
587 | * Clears current document data |
||
588 | * |
||
589 | * @return boolean |
||
590 | * @access public |
||
591 | */ |
||
592 | public function clear() |
||
597 | |||
598 | /** |
||
599 | * Main parsing function |
||
600 | * |
||
601 | * @param string $doc HTML document for processing |
||
602 | * @return string Processed (X)HTML document |
||
603 | * @access public |
||
604 | */ |
||
605 | public function parse($doc, $isUTF7=false) |
||
637 | |||
638 | |||
639 | /** |
||
640 | * UTF-7 decoding function |
||
641 | * |
||
642 | * @param string $str HTML document for recode ASCII part of UTF-7 back to ASCII |
||
643 | * @return string Decoded document |
||
644 | * @access private |
||
645 | */ |
||
646 | private function repackUTF7($str) |
||
650 | |||
651 | /** |
||
652 | * Additional UTF-7 decoding function |
||
653 | * |
||
654 | * @param string $str String for recode ASCII part of UTF-7 back to ASCII |
||
655 | * @return string Recoded string |
||
656 | * @access private |
||
657 | */ |
||
658 | private function repackUTF7Callback($str) |
||
664 | |||
665 | /** |
||
666 | * Additional UTF-7 encoding function |
||
667 | * |
||
668 | * @param string $str String for recode ASCII part of UTF-7 back to ASCII |
||
669 | * @return string Recoded string |
||
670 | * @access private |
||
671 | */ |
||
672 | private function repackUTF7Back($str) |
||
676 | } |
||
677 | |||
686 |