| Total Complexity | 56 |
| Total Lines | 303 |
| Duplicated Lines | 0 % |
| Changes | 0 | ||
Complex classes like HTMLPurifier_Lexer_DOMLex often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use HTMLPurifier_Lexer_DOMLex, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 27 | class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer |
||
| 28 | { |
||
| 29 | |||
| 30 | /** |
||
| 31 | * @type HTMLPurifier_TokenFactory |
||
| 32 | */ |
||
| 33 | private $factory; |
||
| 34 | |||
| 35 | public function __construct() |
||
| 40 | } |
||
| 41 | |||
/**
 * Tokenizes an HTML fragment by parsing it with DOMDocument and walking
 * the resulting tree.
 *
 * The fragment is passed through normalize() and wrapHTML() before being
 * handed to DOMDocument::loadHTML(); the tokens are then read from the
 * first <div> found inside the parsed <body>.
 *
 * @param string $html
 * @param HTMLPurifier_Config $config
 * @param HTMLPurifier_Context $context
 * @return HTMLPurifier_Token[]
 */
public function tokenizeHTML($html, $config, $context)
{
    $html = $this->normalize($html, $config, $context);

    // attempt to armor stray angled brackets that cannot possibly
    // form tags and thus are probably being used as emoticons
    if ($config->get('Core.AggressivelyFixLt')) {
        $char = '[^a-z!\/]';
        $comment = "/<!--(.*?)(-->|\z)/is";
        // Protect ampersands inside comments first, so the undo pass below
        // can tell our own escapes apart from pre-existing entities.
        $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
        // Each pass consumes the character following the bracket, so runs
        // such as "<<" need another pass; repeat until nothing changes.
        do {
            $old = $html;
            // Escape the bracket itself. Replacing with a literal "<" would
            // leave the input untouched and make this whole branch a no-op.
            $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
        } while ($html !== $old);
        $html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
    }

    // preprocess html, essential for UTF-8
    $html = $this->wrapHTML($html, $config, $context);

    $doc = new DOMDocument();
    $doc->encoding = 'UTF-8'; // theoretically, the above has this covered

    // LIBXML_PARSEHUGE is only requested when the configuration asks for it
    // and the constant exists in this build.
    $options = 0;
    if ($config->get('Core.AllowParseManyTags') && defined('LIBXML_PARSEHUGE')) {
        $options |= LIBXML_PARSEHUGE;
    }

    // Silence the warnings loadHTML() raises while parsing.
    set_error_handler(array($this, 'muteErrorHandler'));
    $doc->loadHTML($html, $options);
    restore_error_handler();

    $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
                  getElementsByTagName('body')->item(0);  // <body>

    $div = $body->getElementsByTagName('div')->item(0); // <div>
    $tokens = array();
    $this->tokenizeDOM($div, $tokens, $config);
    // If the div has a sibling, that means we tripped across
    // a premature </div> tag.  So remove the div we parsed,
    // and then tokenize the rest of body.  We can't tokenize
    // the sibling directly as we'll lose the tags in that case.
    if ($div->nextSibling) {
        $body->removeChild($div);
        $this->tokenizeDOM($body, $tokens, $config);
    }
    return $tokens;
}
||
| 96 | |||
/**
 * Iteratively tokenizes a DOM subtree, appending the tokens to an
 * accumulator. One queue of pending nodes is kept per depth level.
 *
 * The node passed in sits at depth 0: no start or end token is collected
 * for it, only for its descendants.
 *
 * @param DOMNode $node Root of the subtree to tokenize.
 * @param HTMLPurifier_Token[] $tokens Accumulator, extended in place.
 * @param HTMLPurifier_Config $config Forwarded to createStartNode().
 * @return void
 */
protected function tokenizeDOM($node, &$tokens, $config)
{
    $depth = 0;
    $queues = array(0 => new HTMLPurifier_Queue(array($node)));
    // Nodes whose end token still has to be emitted, grouped by depth.
    $pendingEnds = array();
    do {
        while (!$queues[$depth]->isEmpty()) {
            $current = $queues[$depth]->shift(); // first in, first out
            // Tokens are only collected below the root (depth > 0).
            if ($this->createStartNode($current, $tokens, $depth > 0, $config)) {
                $pendingEnds[$depth][] = $current;
            }
            $children = $current->childNodes;
            if ($children && $children->length) {
                // Descend: the children get a fresh queue one level down.
                $depth++;
                $queues[$depth] = new HTMLPurifier_Queue();
                foreach ($children as $child) {
                    $queues[$depth]->push($child);
                }
            }
        }
        // The queue at this depth is drained; climb back up and close
        // whatever was opened on the level we return to.
        $depth--;
        if ($depth && isset($pendingEnds[$depth])) {
            while ($open = array_pop($pendingEnds[$depth])) {
                $this->createEndNode($open, $tokens);
            }
        }
    } while ($depth > 0);
}
||
| 133 | |||
/**
 * Portably retrieves the tag name of a node; deals with older versions
 * of libxml like 2.7.6.
 *
 * The properties tagName, nodeName and localName are probed in that
 * order and the first one that is set wins.
 *
 * @param DOMNode $node
 * @return string|null The name, or null when none of the properties is set.
 */
protected function getTagName($node)
{
    foreach (array('tagName', 'nodeName', 'localName') as $property) {
        if (isset($node->$property)) {
            return $node->$property;
        }
    }
    return null;
}
||
| 150 | |||
/**
 * Portably retrieves the data of a node; deals with older versions
 * of libxml like 2.7.6.
 *
 * The properties data, nodeValue and textContent are probed in that
 * order and the first one that is set wins.
 *
 * @param DOMNode $node
 * @return string|null The data, or null when none of the properties is set.
 */
protected function getData($node)
{
    foreach (array('data', 'nodeValue', 'textContent') as $property) {
        if (isset($node->$property)) {
            return $node->$property;
        }
    }
    return null;
}
||
| 167 | |||
| 168 | |||
/**
 * Emits the token(s) that open a DOM node and reports whether a matching
 * end token must be emitted later.
 *
 * @param DOMNode $node DOMNode to be tokenized.
 * @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
 * @param bool $collect Whether start/empty tokens for elements are
 *                  collected; false for the top-level call because that
 *                  node is the implicit DIV wrapper.
 * @param HTMLPurifier_Config $config Passed on to parseText().
 * @return bool True if the node needs an end token.
 * @todo data and tagName properties don't seem to exist in DOMNode?
 */
protected function createStartNode($node, &$tokens, $collect, $config)
{
    // Intercept non-element nodes. WE MUST catch all of them, but
    // character reference nodes are not expected here because those
    // should have been preprocessed.
    $type = $node->nodeType;

    if ($type === XML_TEXT_NODE) {
        $text = $this->getData($node); // handle variable data property
        if ($text !== null) {
            $tokens[] = $this->factory->createText($text);
        }
        return false;
    }

    if ($type === XML_CDATA_SECTION_NODE) {
        // undo libxml's special treatment of <script> and <style> tags
        $previous = end($tokens);
        $content = $node->data;
        // (note: the token name is already normalized)
        $insideRawElement = $previous instanceof HTMLPurifier_Token_Start
            && ($previous->name == 'script' || $previous->name == 'style');
        if ($insideRawElement) {
            $trimmed = trim($content);
            if (substr($trimmed, 0, 4) === '<!--') {
                // Strip the comment opener and, when present, its closer.
                $content = substr($trimmed, 4);
                if (substr($content, -3) === '-->') {
                    $content = substr($content, 0, -3);
                }
                // An opener without a closer is highly suspicious, but the
                // remainder is kept as-is.
            }
        }
        $tokens[] = $this->factory->createText($this->parseText($content, $config));
        return false;
    }

    if ($type === XML_COMMENT_NODE) {
        // This code is only invoked for comments in script/style in
        // versions of libxml pre-2.6.28 (regular comments, of course,
        // are still handled regularly).
        $tokens[] = $this->factory->createComment($node->data);
        return false;
    }

    if ($type !== XML_ELEMENT_NODE) {
        // not well tested: there may be other nodes we have to grab
        return false;
    }

    $attributes = $node->hasAttributes()
        ? $this->transformAttrToAssoc($node->attributes)
        : array();
    $tagName = $this->getTagName($node); // handle variable tagName property
    if (empty($tagName)) {
        // Nameless element: emit nothing, but still ask for an end token
        // when it has children.
        return (bool) $node->childNodes->length;
    }

    // We still have to make sure that the element actually IS empty.
    $hasChildren = (bool) $node->childNodes->length;
    if ($collect) {
        $tokens[] = $hasChildren
            ? $this->factory->createStart($tagName, $attributes)
            : $this->factory->createEmpty($tagName, $attributes);
    }
    return $hasChildren;
}
||
| 235 | |||
/**
 * Appends the end token for an element node to the accumulator.
 *
 * @param DOMNode $node Node whose tag name is used for the end token.
 * @param HTMLPurifier_Token[] $tokens Accumulator, extended in place.
 */
protected function createEndNode($node, &$tokens)
{
    // getTagName() copes with the varying tagName property
    $tokens[] = $this->factory->createEnd($this->getTagName($node));
}
||
| 245 | |||
/**
 * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
 *
 * @param DOMNamedNodeMap $node_map DOMNamedNodeMap of DOMAttr objects.
 * @return array Associative array mapping attribute name to value.
 */
protected function transformAttrToAssoc($node_map)
{
    // Relies on the map being iterable and exposing a ->length attribute.
    $attributes = array();
    if ($node_map->length !== 0) {
        foreach ($node_map as $attribute) {
            $attributes[$attribute->name] = $attribute->value;
        }
    }
    return $attributes;
}
||
| 266 | |||
/**
 * Error handler that swallows every error it is given.
 *
 * Installed around DOMDocument::loadHTML() in tokenizeHTML().
 *
 * @param int $errno Ignored.
 * @param string $errstr Ignored.
 */
public function muteErrorHandler($errno, $errstr)
{
    // Intentionally empty: nothing is reported, logged or rethrown.
}
||
| 275 | |||
/**
 * Callback function for undoing escaping of stray angled brackets
 * in comments.
 *
 * Used with the comment pattern in tokenizeHTML(): $matches[1] is the
 * comment body and $matches[2] is the terminator ("-->" or the empty
 * string at end of input).
 *
 * @param array $matches
 * @return string The rebuilt comment with "&lt;" and "&amp;" decoded.
 */
public function callbackUndoCommentSubst($matches)
{
    // The keys must be the entity forms; mapping "&" and "<" to themselves
    // would leave the escaped brackets inside the comment.
    // strtr() does not rescan its output, so "&amp;lt;" becomes "&lt;"
    // rather than "<".
    return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
}
||
| 286 | |||
| 287 | /** |
||
| 288 | * Callback function that entity-izes ampersands in comments so that |
||
| 289 | * callbackUndoCommentSubst doesn't clobber them |
||
| 290 | * @param array $matches |
||
| 291 | * @return string |
||
| 292 | */ |
||
| 293 | public function callbackArmorCommentEntities($matches) |
||
| 296 | } |
||
| 297 | |||
| 298 | /** |
||
| 299 | * Wraps an HTML fragment in the necessary HTML |
||
| 300 | * @param string $html |
||
| 301 | * @param HTMLPurifier_Config $config |
||
| 302 | * @param HTMLPurifier_Context $context |
||
| 303 | * @return string |
||
| 304 | */ |
||
| 305 | protected function wrapHTML($html, $config, $context, $use_div = true) |
||
| 330 | } |
||
| 331 | } |
||
| 332 | |||
| 333 | // vim: et sw=4 sts=4 |
||
| 334 |
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.