| Total Complexity | 56 | 
| Total Lines | 303 | 
| Duplicated Lines | 0 % | 
| Changes | 0 | ||
Complex classes like HTMLPurifier_Lexer_DOMLex often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use HTMLPurifier_Lexer_DOMLex, and based on these observations, apply Extract Interface, too.
| 1 | <?php  | 
            ||
| 27 | class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer  | 
            ||
| 28 | { | 
            ||
| 29 | |||
| 30 | /**  | 
            ||
| 31 | * @type HTMLPurifier_TokenFactory  | 
            ||
| 32 | */  | 
            ||
| 33 | private $factory;  | 
            ||
| 34 | |||
| 35 | public function __construct()  | 
            ||
| 40 | }  | 
            ||
| 41 | |||
/**
 * Tokenizes an HTML fragment by letting DOMDocument parse it and then
 * walking the resulting tree.
 *
 * The fragment is normalized, optionally has its stray "<" characters
 * escaped (Core.AggressivelyFixLt), is wrapped by wrapHTML(), and is then
 * loaded with libxml warnings muted. Tokens are produced from the first
 * <div> found inside <body>.
 *
 * @param string $html
 * @param HTMLPurifier_Config $config
 * @param HTMLPurifier_Context $context
 * @return HTMLPurifier_Token[]
 */
public function tokenizeHTML($html, $config, $context)
{
    $html = $this->normalize($html, $config, $context);

    // attempt to armor stray angled brackets that cannot possibly
    // form tags and thus are probably being used as emoticons
    if ($config->get('Core.AggressivelyFixLt')) {
        $char = '[^a-z!\/]';
        $comment = "/<!--(.*?)(-->|\z)/is";
        // Protect ampersands inside comments first, so that the undo pass
        // below can tell original entities from the ones added here.
        $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
        do {
            $old = $html;
            // The replacement must be the entity "&lt;": emitting a literal
            // "<" would leave the string unchanged and armor nothing.
            // Looping handles runs such as "<<" where matches overlap.
            $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
        } while ($html !== $old);
        // Comment bodies must keep their original text; callbackUndoCommentSubst
        // is expected to turn "&amp;" and "&lt;" back into "&" and "<" there.
        $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
    }

    // preprocess html, essential for UTF-8
    $html = $this->wrapHTML($html, $config, $context);

    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

    $options = 0;
    if ($config->get('Core.AllowParseManyTags') && defined('LIBXML_PARSEHUGE')) {
        $options |= LIBXML_PARSEHUGE;
    }

    // libxml reports malformed markup as PHP warnings; silence them for
    // the duration of the parse only.
    set_error_handler(array($this, 'muteErrorHandler'));
    $doc->loadHTML($html, $options);
    restore_error_handler();

    $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
                  getElementsByTagName('body')->item(0);  // <body>

    $div = $body->getElementsByTagName('div')->item(0); // <div>
    $tokens = array();
    $this->tokenizeDOM($div, $tokens, $config);
    // If the div has a sibling, that means we tripped across
    // a premature </div> tag. So remove the div we parsed,
    // and then tokenize the rest of body. We can't tokenize
    // the sibling directly as we'll lose the tags in that case.
    if ($div->nextSibling) {
        $body->removeChild($div);
        $this->tokenizeDOM($body, $tokens, $config);
    }
    return $tokens;
}
            ||
| 96 | |||
/**
 * Iteratively tokenizes a DOM subtree, appending the tokens to an accumulator.
 *
 * One FIFO queue of pending nodes is kept per tree depth. Whenever a node
 * has children, a fresh queue holding them is opened one level deeper and
 * drained before the walk steps back up. Nodes for which createStartNode()
 * asks for an end token are remembered per depth and closed, most recent
 * first, once the walk returns to that depth.
 *
 * The node passed in (depth 0) is only a container: its own start token is
 * not collected and no end token is emitted for it.
 *
 * @param DOMNode $node DOMNode to be tokenized.
 * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens; appended to in place.
 * @param HTMLPurifier_Config $config Forwarded to createStartNode().
 */
protected function tokenizeDOM($node, &$tokens, $config)
{
    $depth = 0;
    $queues = array($depth => new HTMLPurifier_Queue(array($node)));
    $pendingEnds = array();

    do {
        while (!$queues[$depth]->isEmpty()) {
            $current = $queues[$depth]->shift(); // FIFO

            // Depth 0 is the implicit wrapper, so its tokens are not collected.
            $wantsEnd = $this->createStartNode($current, $tokens, $depth > 0, $config);
            if ($wantsEnd) {
                $pendingEnds[$depth][] = $current;
            }

            $children = $current->childNodes;
            if ($children && $children->length) {
                // Descend: the children get their own queue one level down.
                $depth++;
                $queues[$depth] = new HTMLPurifier_Queue();
                foreach ($children as $child) {
                    $queues[$depth]->push($child);
                }
            }
        }

        // The queue at this depth is exhausted; step back up and close
        // whatever is still open there.
        $depth--;
        if ($depth !== 0 && isset($pendingEnds[$depth])) {
            while ($open = array_pop($pendingEnds[$depth])) {
                $this->createEndNode($open, $tokens);
            }
        }
    } while ($depth > 0);
}
            ||
| 133 | |||
/**
 * Portably retrieves the tag name of a node; copes with older versions
 * of libxml such as 2.7.6 by probing several property names.
 *
 * @param DOMNode $node
 * @return string|null Value of the first of tagName, nodeName, localName
 *                     that is set on the node, or null when none is.
 */
protected function getTagName($node)
{
    foreach (array('tagName', 'nodeName', 'localName') as $property) {
        if (isset($node->$property)) {
            return $node->$property;
        }
    }
    return null;
}
            ||
| 150 | |||
/**
 * Portably retrieves the character data of a node; copes with older
 * versions of libxml such as 2.7.6 by probing several property names.
 *
 * @param DOMNode $node
 * @return string|null Value of the first of data, nodeValue, textContent
 *                     that is set on the node, or null when none is.
 */
protected function getData($node)
{
    foreach (array('data', 'nodeValue', 'textContent') as $property) {
        if (isset($node->$property)) {
            return $node->$property;
        }
    }
    return null;
}
            ||
| 167 | |||
| 168 | |||
/**
 * Emits the token(s) that open a node and reports whether a matching end
 * token has to be produced later.
 *
 * @param DOMNode $node DOMNode to be tokenized.
 * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens; appended to in place.
 * @param bool $collect Whether start/empty tokens for element nodes are
 *             collected. The caller passes false for the outermost node
 *             because that is the implicit wrapper being dealt with.
 * @param HTMLPurifier_Config $config Passed to parseText() for CDATA content.
 * @return bool True if the node needs an end token.
 * @todo data and tagName properties don't seem to exist in DOMNode?
 */
protected function createStartNode($node, &$tokens, $collect, $config)
{
    // Intercept non-element nodes. WE MUST catch all of them, but
    // character reference nodes are not handled because those should
    // have been preprocessed.
    $type = $node->nodeType;

    if ($type === XML_TEXT_NODE) {
        $text = $this->getData($node); // Handle variable data property
        if ($text !== null) {
            $tokens[] = $this->factory->createText($text);
        }
        return false;
    }

    if ($type === XML_CDATA_SECTION_NODE) {
        // undo libxml's special treatment of <script> and <style> tags
        $previous = end($tokens);
        $cdata = $node->data;
        // (token names are already normalized)
        $insideRawElement = $previous instanceof HTMLPurifier_Token_Start
            && ($previous->name == 'script' || $previous->name == 'style');
        if ($insideRawElement) {
            $trimmed = trim($cdata);
            if (substr($trimmed, 0, 4) === '<!--') {
                // Strip the comment wrapper that hides the content.
                $cdata = substr($trimmed, 4);
                if (substr($cdata, -3) === '-->') {
                    $cdata = substr($cdata, 0, -3);
                }
                // An opener without a closer is highly suspicious, but
                // the remainder is passed through unchanged.
            }
        }
        $tokens[] = $this->factory->createText($this->parseText($cdata, $config));
        return false;
    }

    if ($type === XML_COMMENT_NODE) {
        // This code is only invoked for comments in script/style in
        // versions of libxml pre-2.6.28 (regular comments, of course,
        // are still handled regularly).
        $tokens[] = $this->factory->createComment($node->data);
        return false;
    }

    if ($type !== XML_ELEMENT_NODE) {
        // not well tested: there may be other nodes we have to grab
        return false;
    }

    $attributes = array();
    if ($node->hasAttributes()) {
        $attributes = $this->transformAttrToAssoc($node->attributes);
    }
    $name = $this->getTagName($node); // Handle variable tagName property
    $hasChildren = (bool) $node->childNodes->length;

    if (empty($name)) {
        return $hasChildren;
    }

    // An element is only emitted as an empty token when it really has
    // no children; otherwise it gets a start token and, later, an end.
    if ($collect) {
        $tokens[] = $hasChildren
            ? $this->factory->createStart($name, $attributes)
            : $this->factory->createEmpty($name, $attributes);
    }
    return $hasChildren;
}
            ||
| 235 | |||
/**
 * Appends the end token for an element node.
 *
 * @param DOMNode $node Node whose tag name (as resolved by getTagName()) is closed.
 * @param HTMLPurifier_Token[] $tokens Array-list of tokens; appended to in place.
 */
protected function createEndNode($node, &$tokens)
{
    // getTagName() handles the variable tagName property
    $tokens[] = $this->factory->createEnd($this->getTagName($node));
}
            ||
| 245 | |||
/**
 * Converts a DOMNamedNodeMap of DOMAttr objects into an associative array.
 *
 * @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.
 * @return array Attribute values keyed by attribute name.
 */
protected function transformAttrToAssoc($node_map)
{
    // This relies on the map being traversable with foreach and on it
    // exposing a ->length property.
    $assoc = array();
    if ($node_map->length === 0) {
        return $assoc;
    }
    foreach ($node_map as $attribute) {
        $assoc[$attribute->name] = $attribute->value;
    }
    return $assoc;
}
            ||
| 266 | |||
/**
 * An error handler that mutes all errors.
 *
 * It is installed with set_error_handler() around the DOMDocument::loadHTML()
 * call in tokenizeHTML(). Both parameters exist only to satisfy the handler
 * signature and are deliberately unused.
 *
 * @param int $errno
 * @param string $errstr
 */
public function muteErrorHandler($errno, $errstr)
{
    // Intentionally empty: doing nothing is what silences the error.
}
            ||
| 275 | |||
/**
 * Callback function for undoing escaping of stray angled brackets
 * in comments.
 *
 * Used with preg_replace_callback() and the comment pattern in
 * tokenizeHTML(): $matches[1] is the comment body and $matches[2] is
 * either the closing "-->" or the empty string when the comment runs to
 * the end of the input.
 *
 * @param array $matches
 * @return string The comment with "&amp;" and "&lt;" in its body turned
 *                back into "&" and "<".
 */
public function callbackUndoCommentSubst($matches)
{
    // The keys must be the entity forms; an identity map would undo nothing.
    // strtr() does not rescan its output, so a body that held "&amp;lt;"
    // correctly comes out as "&lt;" rather than "<".
    return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
}
            ||
| 286 | |||
| 287 | /**  | 
            ||
| 288 | * Callback function that entity-izes ampersands in comments so that  | 
            ||
| 289 | * callbackUndoCommentSubst doesn't clobber them  | 
            ||
| 290 | * @param array $matches  | 
            ||
| 291 | * @return string  | 
            ||
| 292 | */  | 
            ||
| 293 | public function callbackArmorCommentEntities($matches)  | 
            ||
| 296 | }  | 
            ||
| 297 | |||
| 298 | /**  | 
            ||
| 299 | * Wraps an HTML fragment in the necessary HTML  | 
            ||
| 300 | * @param string $html  | 
            ||
| 301 | * @param HTMLPurifier_Config $config  | 
            ||
| 302 | * @param HTMLPurifier_Context $context  | 
            ||
| 303 | * @return string  | 
            ||
| 304 | */  | 
            ||
| 305 | protected function wrapHTML($html, $config, $context, $use_div = true)  | 
            ||
| 330 | }  | 
            ||
| 331 | }  | 
            ||
| 332 | |||
| 333 | // vim: et sw=4 sts=4  | 
            ||
| 334 | 
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.