| Total Complexity | 82 |
| Total Lines | 776 |
| Duplicated Lines | 0 % |
| Changes | 1 | ||
| Bugs | 0 | Features | 0 |
Complex classes like Emogrifier often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Emogrifier, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 11 | class Emogrifier { |
||
/**
 * The character encoding the HTML and the CSS are expected to use.
 *
 * @var string
 */
const ENCODING = 'UTF-8';

/**
 * Index into $caches: CSS that has already been split into sorted selector entries.
 *
 * @var integer
 */
const CACHE_KEY_CSS = 0;

/**
 * Index into $caches: per-selector data (e.g. the calculated precedence).
 *
 * @var integer
 */
const CACHE_KEY_SELECTOR = 1;

/**
 * Index into $caches: CSS selectors translated to XPath.
 *
 * @var integer
 */
const CACHE_KEY_XPATH = 2;

/**
 * Index into $caches: parsed CSS declaration blocks.
 *
 * @var integer
 */
const CACHE_KEY_CSS_DECLARATION_BLOCK = 3;

/**
 * Array key of the index/offset part of a parsed nth-child or nth-of-type expression.
 *
 * @var integer
 */
const INDEX = 0;

/**
 * Array key of the multiplier part of a parsed nth-child or nth-of-type expression.
 *
 * @var integer
 */
const MULTIPLIER = 1;

/**
 * Regular expression matching an (optionally element-qualified) ID selector, e.g. "div#main".
 *
 * @var string
 */
const ID_ATTRIBUTE_MATCHER = '/(\\w+)?\\#([\\w\\-]+)/';

/**
 * Regular expression matching an (optionally element-qualified) chain of class selectors, e.g. "p.a.b".
 *
 * @var string
 */
const CLASS_ATTRIBUTE_MATCHER = '/(\\w+|[\\*\\]])?((\\.[\\w\\-]+)+)/';

/**
 * The HTML to emogrify.
 *
 * @var string
 */
private $html = '';

/**
 * The CSS to merge into the HTML.
 *
 * @var string
 */
private $css = '';

/**
 * Names of the tags that get stripped from the HTML before it is loaded into DOMDocument.
 *
 * @var array<string>
 */
private $unprocessableHtmlTags = array('wbr');

/**
 * The internal caches, one array per CACHE_KEY_* constant.
 *
 * @var array<integer, array>
 */
private $caches = array(
    self::CACHE_KEY_CSS => array(),
    self::CACHE_KEY_SELECTOR => array(),
    self::CACHE_KEY_XPATH => array(),
    self::CACHE_KEY_CSS_DECLARATION_BLOCK => array(),
);

/**
 * The visited nodes with the XPath paths as array keys.
 *
 * @var array<string, \DOMNode>
 */
private $visitedNodes = array();

/**
 * The styles to apply to the nodes: the outer array is keyed by the XPath path of the node,
 * the inner array holds the style property names/values as key/value pairs.
 *
 * @var array<string, array<string, string>>
 */
private $styleAttributesForNodes = array();

/**
 * This attribute applies to the case where you want to preserve your original text encoding.
 *
 * By default, emogrifier translates your text into HTML entities for two reasons:
 *
 * 1. Because of client incompatibilities, it is better practice to send out HTML entities
 *    rather than unicode over email.
 *
 * 2. It translates any illegal XML characters that DOMDocument cannot work with.
 *
 * If you would like to preserve your original encoding, set this attribute to TRUE.
 *
 * @var boolean
 */
public $preserveEncoding = false;

/**
 * Collects the @media blocks found while splitting the CSS; it is appended to by
 * _media_concat() and reset by emogrify().
 *
 * @var string
 */
public static $_media = '';
||
| 117 | |||
/**
 * Creates an instance and stores the markup and the style sheet to work on.
 *
 * Both values are handed to the corresponding setters.
 *
 * @param string $html the HTML to emogrify, must be UTF-8-encoded
 * @param string $css the CSS to merge, must be UTF-8-encoded
 */
public function __construct($html = '', $css = '') {
    $this->setHtml($html);
    $this->setCss($css);
}
||
| 128 | |||
/**
 * Releases the references to the DOM nodes collected during emogrification.
 */
public function __destruct() {
    $this->purgeVisitedNodes();
}
||
| 135 | |||
/**
 * Stores the HTML that the next call to emogrify() will process.
 *
 * @param string $html the HTML to emogrify, must be UTF-8-encoded
 */
public function setHtml($html = '') {
    $this->html = $html;
}
||
| 144 | |||
/**
 * Stores the CSS that the next call to emogrify() will merge into the HTML.
 *
 * @param string $css the CSS to merge, must be UTF-8-encoded
 */
public function setCss($css = '') {
    $this->css = $css;
}
||
| 153 | |||
/**
 * Empties each of the four internal caches.
 */
private function clearAllCaches() {
    $cacheKeys = array(
        self::CACHE_KEY_CSS,
        self::CACHE_KEY_SELECTOR,
        self::CACHE_KEY_XPATH,
        self::CACHE_KEY_CSS_DECLARATION_BLOCK,
    );

    foreach ($cacheKeys as $cacheKey) {
        $this->clearCache($cacheKey);
    }
}
||
| 163 | |||
/**
 * Empties the cache stored under the given key.
 *
 * @param integer $key the cache key, must be CACHE_KEY_CSS, CACHE_KEY_SELECTOR, CACHE_KEY_XPATH
 *                     or CACHE_KEY_CSS_DECLARATION_BLOCK
 *
 * @throws InvalidArgumentException if $key is not one of the cache key constants (strict comparison)
 */
private function clearCache($key) {
    $isKnownKey = in_array(
        $key,
        array(self::CACHE_KEY_CSS, self::CACHE_KEY_SELECTOR, self::CACHE_KEY_XPATH, self::CACHE_KEY_CSS_DECLARATION_BLOCK),
        true
    );

    if ($isKnownKey) {
        $this->caches[$key] = array();
        return;
    }

    throw new InvalidArgumentException('Invalid cache key: ' . $key, 1391822035);
}
||
| 179 | |||
/**
 * Forgets all visited nodes and the original inline styles recorded for them.
 */
private function purgeVisitedNodes() {
    $this->styleAttributesForNodes = array();
    $this->visitedNodes = array();
}
||
| 187 | |||
/**
 * Adds a tag name to the list of tags that get stripped from the HTML before processing.
 *
 * There are some HTML tags that DOMDocument cannot process, and it will throw an error if it encounters them.
 * In particular, DOMDocument will complain if you try to use HTML5 tags in an XHTML document.
 *
 * Note: Only the tags themselves are stripped; any content between them stays in the document.
 *
 * @param string $tagName the tag name, e.g., "p"
 */
public function addUnprocessableHtmlTag($tagName) {
    array_push($this->unprocessableHtmlTags, $tagName);
}
||
| 201 | |||
| 202 | /** |
||
| 203 | * Drops a tag from the removal list. |
||
| 204 | * |
||
| 205 | * @param string $tagName the tag name, e.g., "p" |
||
| 206 | */ |
||
| 207 | public function removeUnprocessableHtmlTag($tagName) { |
||
| 211 | } |
||
| 212 | } |
||
| 213 | |||
/**
 * Applies the CSS you submit to the HTML you submit.
 *
 * This method places the CSS inline: the CSS passed in plus the content of all <style> elements
 * is split into rules, the rules are sorted by selector precedence, and their declarations are
 * written into the style attribute of every matching element. Inline styles that were already
 * present in the HTML are re-applied last so that they win. Elements that end up with
 * "display: none" are removed, and the collected @media blocks are re-added in a <style> element.
 *
 * Selectors that cannot be translated into a valid XPath expression are skipped.
 *
 * @return string the resulting HTML
 *
 * @throws BadMethodCallException if no HTML has been set
 */
public function emogrify() {
    if ($this->html === '') {
        throw new BadMethodCallException('Please set some HTML first before calling emogrify.', 1390393096);
    }

    $xmlDocument = $this->createXmlDocument();
    $xpath = new DOMXPath($xmlDocument);
    $this->clearAllCaches();

    // Before we begin processing the CSS, parse the document and normalize all existing CSS property names
    // (changes 'DISPLAY: none' to 'display: none');
    // we wouldn't have to do this if DOMXPath supported XPath 2.0.
    // Also store a reference of nodes with existing inline styles so we don't overwrite them.
    $this->purgeVisitedNodes();

    $nodesWithStyleAttributes = $xpath->query('//*[@style]');
    if ($nodesWithStyleAttributes !== false) {
        /** @var $node DOMElement */
        foreach ($nodesWithStyleAttributes as $node) {
            $normalizedOriginalStyle = preg_replace_callback('/[A-z\\-]+(?=\\:)/S', array($this, 'strtolower'), $node->getAttribute('style'));

            // in order to not overwrite existing style attributes in the HTML, we have to save the original HTML styles
            $nodePath = $node->getNodePath();
            if (!isset($this->styleAttributesForNodes[$nodePath])) {
                $this->styleAttributesForNodes[$nodePath] = $this->parseCssDeclarationBlock($normalizedOriginalStyle);
                $this->visitedNodes[$nodePath] = $node;
            }

            $node->setAttribute('style', $normalizedOriginalStyle);
        }
    }

    // grab any existing style blocks from the html and append them to the existing CSS
    // (these blocks should be appended so as to have precedence over conflicting styles in the existing CSS)
    $allCss = $this->css;

    $allCss .= $this->getCssFromAllStyleNodes($xpath);

    $cssParts = $this->splitCssAndMediaQuery($allCss);
    self::$_media = ''; // reset

    $cssKey = md5($cssParts['css']);
    if (!isset($this->caches[self::CACHE_KEY_CSS][$cssKey])) {
        // process the CSS file for selectors and definitions
        preg_match_all('/(?:^|[\\s^{}]*)([^{]+){([^}]*)}/mis', $cssParts['css'], $matches, PREG_SET_ORDER);

        $allSelectors = array();
        foreach ($matches as $key => $selectorString) {
            // if there is a blank definition, skip
            if (!strlen(trim($selectorString[2]))) {
                continue;
            }

            // else split by commas and duplicate attributes so we can sort by selector precedence
            $selectors = explode(',', $selectorString[1]);
            foreach ($selectors as $selector) {
                // don't process pseudo-elements and behavioral (dynamic) pseudo-classes; ONLY allow structural pseudo-classes
                if (strpos($selector, ':') !== false && !preg_match('/:\\S+\\-(child|type)\\(/i', $selector)) {
                    continue;
                }

                $allSelectors[] = array('selector' => trim($selector),
                    'attributes' => trim($selectorString[2]),
                    // keep track of where it appears in the file, since order is important
                    'line' => $key,
                );
            }
        }

        // now sort the selectors by precedence
        usort($allSelectors, array($this,'sortBySelectorPrecedence'));

        $this->caches[self::CACHE_KEY_CSS][$cssKey] = $allSelectors;
    }

    foreach ($this->caches[self::CACHE_KEY_CSS][$cssKey] as $value) {
        // query the body for the xpath selector
        $nodesMatchingCssSelectors = $xpath->query($this->translateCssToXpath($value['selector']));
        if ($nodesMatchingCssSelectors === false) {
            // DOMXPath::query() returns false for an expression it cannot evaluate, i.e. the selector
            // is not supported by the translation; skip the rule instead of iterating over false.
            continue;
        }

        /** @var $node DOMElement */
        foreach ($nodesMatchingCssSelectors as $node) {
            // if it has a style attribute, get it, process it, and append (overwrite) new stuff
            if ($node->hasAttribute('style')) {
                // break it up into an associative array
                $oldStyleDeclarations = $this->parseCssDeclarationBlock($node->getAttribute('style'));
            } else {
                $oldStyleDeclarations = array();
            }
            $newStyleDeclarations = $this->parseCssDeclarationBlock($value['attributes']);
            $node->setAttribute('style', $this->generateStyleStringFromDeclarationsArrays($oldStyleDeclarations, $newStyleDeclarations));
        }
    }

    // now iterate through the nodes that contained inline styles in the original HTML
    foreach ($this->styleAttributesForNodes as $nodePath => $styleAttributesForNode) {
        $node = $this->visitedNodes[$nodePath];
        $currentStyleAttributes = $this->parseCssDeclarationBlock($node->getAttribute('style'));
        $node->setAttribute('style', $this->generateStyleStringFromDeclarationsArrays($currentStyleAttributes, $styleAttributesForNode));
    }

    // This removes styles from your email that contain display:none.
    // We need to look for display:none, but we need to do a case-insensitive search. Since DOMDocument only supports XPath 1.0,
    // lower-case() isn't available to us. We've thus far only set attributes to lowercase, not attribute values. Consequently, we need
    // to translate() the letters that would be in 'NONE' ("NOE") to lowercase.
    $nodesWithStyleDisplayNone = $xpath->query('//*[contains(translate(translate(@style," ",""),"NOE","noe"),"display:none")]');
    // The checks on parentNode and is_callable below ensure that if we've deleted the parent node,
    // we don't try to call removeChild on a nonexistent child node
    if ($nodesWithStyleDisplayNone !== false && $nodesWithStyleDisplayNone->length > 0) {
        /** @var $node \DOMNode */
        foreach ($nodesWithStyleDisplayNone as $node) {
            if ($node->parentNode && is_callable(array($node->parentNode,'removeChild'))) {
                $node->parentNode->removeChild($node);
            }
        }
    }

    $this->copyCssWithMediaToStyleNode($cssParts, $xmlDocument);

    if ($this->preserveEncoding) {
        if ( function_exists( 'mb_convert_encoding' ) ) {
            return mb_convert_encoding( $xmlDocument->saveHTML(), self::ENCODING, 'HTML-ENTITIES' );
        } else {
            return htmlspecialchars_decode( utf8_encode( html_entity_decode( $xmlDocument->saveHTML(), ENT_COMPAT, self::ENCODING ) ) );
        }
    } else {
        return $xmlDocument->saveHTML();
    }
}
||
| 350 | |||
/**
 * preg_replace_callback() helper: returns the complete match converted to lower case.
 *
 * @param array $m the match array passed by preg_replace_callback(); only element 0 is used
 *
 * @return string the lower-cased match
 */
public function strtolower(array $m) {
    $matchedText = $m[0];

    return strtolower($matchedText);
}
||
| 354 | |||
| 355 | |||
/**
 * Merges two sets of CSS declarations and renders them as an inline style string.
 *
 * Declarations from $newStyles replace declarations of the same name in $oldStyles. Every
 * declaration is rendered as "name: value;" with the name lower-cased and both parts trimmed,
 * and the declarations are separated by a single space. This is the single point for CSS string
 * generation, which keeps the output consistent no matter where the CSS originally came from.
 *
 * @param array $oldStyles the existing declarations, property name => value
 * @param array $newStyles the declarations to merge in, property name => value
 *
 * @return string the combined style string, empty if there are no declarations
 */
private function generateStyleStringFromDeclarationsArrays(array $oldStyles, array $newStyles) {
    $declarations = array();
    foreach (array_merge($oldStyles, $newStyles) as $propertyName => $propertyValue) {
        $declarations[] = strtolower(trim($propertyName)) . ': ' . trim($propertyValue) . ';';
    }

    return trim(implode(' ', $declarations));
}
||
| 373 | |||
| 374 | |||
/**
 * Adds the "media" part of the split CSS to $xmlDocument as a style element.
 *
 * Nothing happens if the "media" entry is missing or an empty string.
 *
 * @param array $cssParts the split CSS; only the key "media" is read
 * @param DOMDocument $xmlDocument the document to add the style element to
 */
public function copyCssWithMediaToStyleNode(array $cssParts, DOMDocument $xmlDocument) {
    if (!isset($cssParts['media']) || $cssParts['media'] === '') {
        return;
    }

    $this->addStyleElementToDocument($xmlDocument, $cssParts['media']);
}
||
| 386 | |||
/**
 * Collects the content of all style elements and removes those elements from the document.
 *
 * The content of each style element is prefixed with two line feeds.
 *
 * @param DOMXPath $xpath the XPath instance bound to the document to read from
 *
 * @return string the concatenated CSS, empty if the query fails or there are no style elements
 */
private function getCssFromAllStyleNodes(DOMXPath $xpath) {
    $styleNodes = $xpath->query('//style');
    if ($styleNodes === false) {
        return '';
    }

    $cssChunks = array();
    /** @var $styleNode DOMNode */
    foreach ($styleNodes as $styleNode) {
        $cssChunks[] = "\n\n" . $styleNode->nodeValue;
        $styleNode->parentNode->removeChild($styleNode);
    }

    return implode('', $cssChunks);
}
||
| 409 | |||
/**
 * Adds a style element (type "text/css") containing $css to the head of $document.
 *
 * The CSS is attached as a text node: the value parameter of DOMDocument::createElement()
 * is not escaped, so CSS containing "&" (e.g. in a URL) would trigger an
 * "unterminated entity reference" warning and lose content.
 *
 * @param DOMDocument $document the document to add the style element to
 * @param string $css the CSS to put into the style element
 */
private function addStyleElementToDocument(DOMDocument $document, $css) {
    $styleElement = $document->createElement('style');
    $styleElement->appendChild($document->createTextNode($css));
    $styleElement->setAttribute('type', 'text/css');

    $head = $this->getOrCreateHeadElement($document);
    $head->appendChild($styleElement);
}
||
| 425 | |||
/**
 * Returns the first head element of $document, creating one if there is none.
 *
 * A newly created head element is inserted into the first html element, before the first
 * body element.
 *
 * @param DOMDocument $document the document to search/extend
 *
 * @return DOMNode the head element
 */
private function getOrCreateHeadElement(DOMDocument $document) {
    $existingHead = $document->getElementsByTagName('head')->item(0);
    if ($existingHead !== null) {
        return $existingHead;
    }

    $newHead = $document->createElement('head');
    $htmlElement = $document->getElementsByTagName('html')->item(0);
    $bodyElement = $document->getElementsByTagName('body')->item(0);
    $htmlElement->insertBefore($newHead, $bodyElement);

    return $newHead;
}
||
| 443 | |||
/**
 * Splits the given CSS into the part that can be inlined and the media queries worth keeping.
 *
 * The returned array has two keys:
 *
 * - "css" contains the CSS with comments, @import directives and @media blocks removed
 * - "media" contains the @media blocks that were collected in self::$_media
 *
 * Example:
 *
 * The CSS code
 *
 *   "@import "file.css"; h1 { color:red; } @media { h1 {}} @media tv { h1 {}}"
 *
 * will be parsed into the following array:
 *
 *   "css" => "h1 { color:red; }"
 *   "media" => "@media { h1 {}}"
 *
 * @param string $css the CSS to split
 *
 * @return array the split CSS with the keys "css" and "media"
 */
private function splitCssAndMediaQuery($css) {
    // Every @media block matched here is handed to _media_concat(), which collects it.
    $cssWithoutCollectedMedia = preg_replace_callback( '#@media\\s+(?:only\\s)?(?:[\\s{\(]|screen|all)\\s?[^{]+{.*}\\s*}\\s*#misU', array( $this, '_media_concat' ), $css );

    $patternsToStrip = array(
        // CSS comments
        '/\\/\\*.*\\*\\//sU',
        // @import directives
        '/^\\s*@import\\s[^;]+;/misU',
        // the remaining @media blocks
        '/^\\s*@media\\s[^{]+{(.*)}\\s*}\\s/misU',
    );

    // a string replacement is applied to every pattern of the array
    $cleanCss = preg_replace($patternsToStrip, '', $cssWithoutCollectedMedia);

    return array('css' => $cleanCss, 'media' => self::$_media);
}
||
| 488 | |||
/**
 * preg_replace_callback() helper: appends the matched @media block to self::$_media.
 *
 * @param array $matches the match array passed by preg_replace_callback(); only element 0 is used
 *
 * @return string always an empty string, so that the matched block is removed from the subject
 */
private function _media_concat( $matches ) {
    self::$_media .= $matches[0];

    // A replacement callback has to return a string; make the removal explicit instead of relying
    // on the implicit null being cast to ''.
    return '';
}
||
| 492 | |||
/**
 * Builds a DOMDocument from the unified version of the current HTML.
 *
 * libxml errors raised while loading the HTML are collected internally and discarded;
 * the previous libxml error handling mode is restored afterwards.
 *
 * @return DOMDocument the normalized document
 */
private function createXmlDocument() {
    $document = new DOMDocument;
    $document->encoding = self::ENCODING;
    $document->strictErrorChecking = false;
    $document->formatOutput = true;

    $previousLibXmlState = libxml_use_internal_errors(true);
    $document->loadHTML($this->getUnifiedHtml());
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibXmlState);

    $document->normalizeDocument();

    return $document;
}
||
| 511 | |||
/**
 * Returns the HTML with the unprocessable HTML tags removed and the non-ASCII characters
 * converted into HTML entities.
 *
 * Only the opening and closing tags themselves are stripped, not their content. Tag names are
 * matched case-insensitively and as whole names, so registering "p" does not strip "<pre>".
 * Empty tag names in the list are ignored.
 *
 * @return string the unified HTML
 */
private function getUnifiedHtml() {
    // Quote the names so that they are matched literally and cannot break the delimiter.
    $quotedTagNames = array();
    foreach ($this->unprocessableHtmlTags as $tagName) {
        if ((string) $tagName !== '') {
            $quotedTagNames[] = preg_quote($tagName, '/');
        }
    }

    if (!empty($quotedTagNames)) {
        // The look-ahead makes sure the tag name ends where the match ends (whitespace, "/" or ">").
        $bodyWithoutUnprocessableTags = preg_replace(
            '/<\\/?(' . implode('|', $quotedTagNames) . ')(?=[\\s\\/>])[^>]*>/i',
            '',
            $this->html
        );
    } else {
        $bodyWithoutUnprocessableTags = $this->html;
    }

    if ( function_exists( 'mb_convert_encoding' ) ) {
        return mb_convert_encoding( $bodyWithoutUnprocessableTags, 'HTML-ENTITIES', self::ENCODING );
    } else {
        // fallback for installations without the mbstring extension
        return htmlspecialchars_decode( utf8_decode( htmlentities( $bodyWithoutUnprocessableTags, ENT_COMPAT, self::ENCODING, false ) ) );
    }
}
||
| 533 | |||
/**
 * usort() comparator that orders selector entries by ascending precedence.
 *
 * Selectors with lesser precedence get processed first and selectors with greater precedence
 * get processed last. Entries with equal precedence are ordered by their position in the CSS
 * ("line"); entries that also share the position compare as equal.
 *
 * The parentheses around the -1 are necessary to avoid a PHP_CodeSniffer warning about missing
 * spaces around arithmetic operators.
 *
 * @see http://forge.typo3.org/issues/55605
 *
 * @param array $a selector entry with the keys "selector" and "line"
 * @param array $b selector entry with the keys "selector" and "line"
 *
 * @return integer -1, 0 or 1
 */
private function sortBySelectorPrecedence(array $a, array $b) {
    $precedenceA = $this->getCssSelectorPrecedence($a['selector']);
    $precedenceB = $this->getCssSelectorPrecedence($b['selector']);

    if ($precedenceA !== $precedenceB) {
        return ($precedenceA < $precedenceB) ? (-1) : 1;
    }

    if ($a['line'] === $b['line']) {
        // Selectors of the same rule: report a tie so the comparison stays consistent
        // regardless of the argument order.
        return 0;
    }

    return ($a['line'] < $b['line']) ? (-1) : 1;
}
||
| 553 | |||
/**
 * Calculates the precedence (specificity) of a CSS selector; the result is cached.
 *
 * IDs are worth 100, classes are worth 10 and elements are worth 1. The matched parts are
 * removed from the selector step by step so that nothing is counted twice.
 *
 * @param string $selector the CSS selector
 *
 * @return integer the precedence of the selector
 */
private function getCssSelectorPrecedence($selector) {
    $selectorKey = md5($selector);
    if (!isset($this->caches[self::CACHE_KEY_SELECTOR][$selectorKey])) {
        $precedence = 0;
        $value = 100;
        // ids: worth 100, classes: worth 10, elements: worth 1
        $search = array('\\#','\\.','');

        foreach ($search as $s) {
            // nothing left to count
            if (trim($selector) === '') {
                break;
            }
            $number = 0;
            $selector = preg_replace('/' . $s . '\\w+/', '', $selector, -1, $number);
            $precedence += ($value * $number);
            $value /= 10;
        }
        $this->caches[self::CACHE_KEY_SELECTOR][$selectorKey] = $precedence;
    }

    return $this->caches[self::CACHE_KEY_SELECTOR][$selectorKey];
}
||
| 581 | |||
/**
 * Translates a CSS selector into an XPath expression; the result is cached in the XPath cache.
 *
 * Right now, we support all CSS 1 selectors and most CSS2/3 selectors.
 *
 * @see http://plasmasturm.org/log/444/
 *
 * @param string $paramCssSelector the CSS selector
 *
 * @return string the XPath expression
 */
private function translateCssToXpath($paramCssSelector) {
    // Pad with spaces so that a bare word at the start or the end is whitespace-delimited, too,
    // then lower-case all whitespace-delimited bare words.
    $cssSelector = ' ' . $paramCssSelector . ' ';
    $cssSelector = preg_replace_callback( '/\s+\w+\s+/', array( $this, 'strtolower' ), $cssSelector );
    $cssSelector = trim($cssSelector);
    $xpathKey = md5($cssSelector);
    if (!isset($this->caches[self::CACHE_KEY_XPATH][$xpathKey])) {
        // returns an Xpath selector
        $search = array(
            // Matches any element that is a child of parent.
            '/\\s+>\\s+/',
            // Matches any element that is an adjacent sibling.
            '/\\s+\\+\\s+/',
            // Matches any element that is a descendant of a parent element.
            '/\\s+/',
            // first-child pseudo-selector
            '/([^\\/]+):first-child/i',
            // last-child pseudo-selector
            '/([^\\/]+):last-child/i',
            // Matches attribute only selector
            '/^\\[(\\w+)\\]/',
            // Matches element with attribute
            '/(\\w)\\[(\\w+)\\]/',
            // Matches element with EXACT attribute
            '/(\\w)\\[(\\w+)\\=[\'"]?(\\w+)[\'"]?\\]/',
        );
        $replace = array(
            '/',
            '/following-sibling::*[1]/self::',
            '//',
            '*[1]/self::\\1',
            '*[last()]/self::\\1',
            '*[@\\1]',
            '\\1[@\\2]',
            '\\1[@\\2="\\3"]',
        );

        $cssSelector = '//' . preg_replace($search, $replace, $cssSelector);

        $cssSelector = preg_replace_callback(self::ID_ATTRIBUTE_MATCHER, array($this, 'matchIdAttributes'), $cssSelector);
        $cssSelector = preg_replace_callback(self::CLASS_ATTRIBUTE_MATCHER, array($this, 'matchClassAttributes'), $cssSelector);

        // Advanced selectors are going to require a bit more advanced emogrification.
        // Once PHP 5.3 is required, this could be done with closures.
        $cssSelector = preg_replace_callback(
            '/([^\\/]+):nth-child\\(\s*(odd|even|[+\-]?\\d|[+\\-]?\\d?n(\\s*[+\\-]\\s*\\d)?)\\s*\\)/i',
            array($this, 'translateNthChild'), $cssSelector
        );
        $cssSelector = preg_replace_callback(
            '/([^\\/]+):nth-of-type\\(\s*(odd|even|[+\-]?\\d|[+\\-]?\\d?n(\\s*[+\\-]\\s*\\d)?)\\s*\\)/i',
            array($this, 'translateNthOfType'), $cssSelector
        );

        // Store under the same cache that is checked above; the selector cache holds other data.
        $this->caches[self::CACHE_KEY_XPATH][$xpathKey] = $cssSelector;
    }
    return $this->caches[self::CACHE_KEY_XPATH][$xpathKey];
}
||
| 647 | |||
/**
 * preg_replace_callback() helper for ID_ATTRIBUTE_MATCHER: turns "element#id" into an XPath
 * step with an @id predicate. "*" is used if there is no element name.
 *
 * @param array $match the match array: 1 = element name (may be empty), 2 = ID
 *
 * @return string the XPath step, e.g. 'div[@id="main"]'
 */
private function matchIdAttributes(array $match) {
    $elementName = $match[1];
    if (!strlen($elementName)) {
        $elementName = '*';
    }

    return $elementName . '[@id="' . $match[2] . '"]';
}
||
| 656 | |||
/**
 * preg_replace_callback() helper for CLASS_ATTRIBUTE_MATCHER: turns "element.a.b" into an XPath
 * step with one @class predicate per class name. "*" is used if there is no element name.
 *
 * @param array $match the match array: 1 = element name (may be empty), 2 = the class chain
 *                     including the leading dot, e.g. ".a.b"
 *
 * @return string the XPath step
 */
private function matchClassAttributes(array $match) {
    $xpathStep = strlen($match[1]) ? $match[1] : '*';

    // drop the leading dot, then split the chain into the single class names
    $classNames = explode('.', substr($match[2], 1));
    foreach ($classNames as $className) {
        $xpathStep .= '[contains(concat(" ",@class," "),concat(" ","' . $className . '"," "))]';
    }

    return $xpathStep;
}
||
| 669 | |||
/**
 * preg_replace_callback() helper: translates an ":nth-child(...)" match into an XPath step.
 *
 * @param array $match the match array: 1 = the selector part before the pseudo-class,
 *                     2 = the nth expression, 3 = its offset part (optional)
 *
 * @return string the XPath step
 */
private function translateNthChild(array $match) {
    $parsed = $this->parseNth($match);

    // plain index, e.g. :nth-child(3)
    if (!isset($parsed[self::MULTIPLIER])) {
        return sprintf('*[%u]/self::%s', $parsed[self::INDEX], $match[1]);
    }

    // negative multiplier: count from the end
    if ($parsed[self::MULTIPLIER] < 0) {
        $parsed[self::MULTIPLIER] = abs($parsed[self::MULTIPLIER]);
        return sprintf(
            '*[(last() - position()) mod %u = %u]/self::%s',
            $parsed[self::MULTIPLIER],
            $parsed[self::INDEX],
            $match[1]
        );
    }

    return sprintf('*[position() mod %u = %u]/self::%s', $parsed[self::MULTIPLIER], $parsed[self::INDEX], $match[1]);
}
||
| 689 | |||
/**
 * preg_replace_callback() helper: translates an ":nth-of-type(...)" match into an XPath step.
 *
 * @param array $match the match array: 1 = the selector part before the pseudo-class,
 *                     2 = the nth expression, 3 = its offset part (optional)
 *
 * @return string the XPath step
 */
private function translateNthOfType(array $match) {
    $parsed = $this->parseNth($match);

    // plain index, e.g. :nth-of-type(3)
    if (!isset($parsed[self::MULTIPLIER])) {
        return sprintf('%s[%u]', $match[1], $parsed[self::INDEX]);
    }

    // negative multiplier: count from the end
    if ($parsed[self::MULTIPLIER] < 0) {
        $parsed[self::MULTIPLIER] = abs($parsed[self::MULTIPLIER]);
        return sprintf(
            '%s[(last() - position()) mod %u = %u]',
            $match[1],
            $parsed[self::MULTIPLIER],
            $parsed[self::INDEX]
        );
    }

    return sprintf('%s[position() mod %u = %u]', $match[1], $parsed[self::MULTIPLIER], $parsed[self::INDEX]);
}
||
| 709 | |||
/**
 * Parses the expression of an nth-child/nth-of-type selector ("odd", "even", "3", "2n+1", ...).
 *
 * The result always contains the key self::INDEX. The key self::MULTIPLIER is only set for
 * "odd"/"even" and for "an+b" expressions with a non-zero multiplier. A multiplier that consists
 * of a sign only (e.g. "-n+3") or that is zero is treated as "no multiplier".
 *
 * @param array $match the match array: 2 = the nth expression, 3 = its offset part
 *                     (optional, always the tail of the expression)
 *
 * @return array the parsed expression with the keys self::INDEX and (optionally) self::MULTIPLIER
 */
private function parseNth(array $match) {
    $keyword = strtolower($match[2]);
    if (in_array($keyword, array('even','odd'), true)) {
        $index = ($keyword === 'even') ? 0 : 1;
        return array(self::MULTIPLIER => 2, self::INDEX => $index);
    } elseif (stripos($match[2], 'n') === false) {
        // there is no multiplier, just a plain index
        $index = intval(preg_replace('/\\s+/', '', $match[2]));
        return array(self::INDEX => $index);
    } else {
        if (isset($match[3]) && $match[3] !== '') {
            // Cut the offset off the end only; removing every occurrence of its text would also
            // eat an identical multiplier ("+2n+2" must keep its "+2" multiplier).
            $multipleTerm = substr($match[2], 0, -strlen($match[3]));
            $index = intval(preg_replace('/\\s+/', '', $match[3]));
        } else {
            $multipleTerm = $match[2];
            $index = 0;
        }

        $multiplier = str_ireplace('n', '', $multipleTerm);

        if (!strlen($multiplier)) {
            $multiplier = 1;
        } else {
            // Convert before testing for zero: a bare "+" or "-" must not reach the loop below
            // as a zero multiplier, no matter how the PHP version compares strings with numbers.
            $multiplier = intval($multiplier);
            if ($multiplier === 0) {
                return array(self::INDEX => $index);
            }
        }

        // move a negative offset into the non-negative range
        while ($index < 0) {
            $index += abs($multiplier);
        }

        return array(self::MULTIPLIER => $multiplier, self::INDEX => $index);
    }
}
||
| 749 | |||
| 750 | /** |
||
| 751 | * Parses a CSS declaration block into property name/value pairs. |
||
| 752 | * |
||
| 753 | * Example: |
||
| 754 | * |
||
| 755 | * The declaration block. |
||
| 756 | * |
||
| 757 | * "color: #000; font-weight: bold;". |
||
| 758 | * |
||
| 759 | * will be parsed into the following array: |
||
| 760 | * |
||
| 761 | * "color" => "#000" |
||
| 762 | * "font-weight" => "bold" |
||
| 763 | * |
||
| 764 | * @param string $cssDeclarationBlock the CSS declaration block without the curly braces, may be empty |
||
| 765 | * |
||
| 766 | * @return array the CSS declarations with the property names as array keys and the property values as array values |
||
| 767 | */ |
||
| 768 | private function parseCssDeclarationBlock($cssDeclarationBlock) { |
||
| 787 | } |
||
| 788 | } |
||
| 789 |