| Total Complexity | 99 |
| Total Lines | 572 |
| Duplicated Lines | 0 % |
| Changes | 0 | ||
Complex classes like NodeTrait often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use NodeTrait, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 14 | trait NodeTrait |
||
| 15 | { |
||
| 16 | /** |
||
| 17 | * Content score of the node. Used to determine the value of the content. |
||
| 18 | * |
||
| 19 | * @var int |
||
| 20 | */ |
||
| 21 | public $contentScore = 0; |
||
| 22 | |||
| 23 | /** |
||
| 24 | * Flag for initialized status. |
||
| 25 | * |
||
| 26 | * @var bool |
||
| 27 | */ |
||
| 28 | private $initialized = false; |
||
| 29 | |||
| 30 | /** |
||
| 31 | * Flag data tables. |
||
| 32 | * |
||
| 33 | * @var bool |
||
| 34 | */ |
||
| 35 | private $readabilityDataTable = false; |
||
| 36 | |||
| 37 | /** |
||
| 38 | * @var array |
||
| 39 | */ |
||
| 40 | private $divToPElements = [ |
||
| 41 | 'a', |
||
| 42 | 'blockquote', |
||
| 43 | 'dl', |
||
| 44 | 'div', |
||
| 45 | 'img', |
||
| 46 | 'ol', |
||
| 47 | 'p', |
||
| 48 | 'pre', |
||
| 49 | 'table', |
||
| 50 | 'ul', |
||
| 51 | 'select', |
||
| 52 | ]; |
||
| 53 | |||
| 54 | /** |
||
| 55 | * The commented out elements qualify as phrasing content but tend to be |
||
| 56 | * removed by readability when put into paragraphs, so we ignore them here. |
||
| 57 | * |
||
| 58 | * @var array |
||
| 59 | */ |
||
| 60 | private $phrasing_elems = [ |
||
| 61 | // 'CANVAS', 'IFRAME', 'SVG', 'VIDEO', |
||
| 62 | 'abbr', 'audio', 'b', 'bdo', 'br', 'button', 'cite', 'code', 'data', |
||
| 63 | 'datalist', 'dfn', 'em', 'embed', 'i', 'img', 'input', 'kbd', 'label', |
||
| 64 | 'mark', 'math', 'meter', 'noscript', 'object', 'output', 'progress', 'q', |
||
| 65 | 'ruby', 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', |
||
| 66 | 'sup', 'textarea', 'time', 'var', 'wbr' |
||
| 67 | ]; |
||
| 68 | |||
/**
 * Tells whether initializeNode() has already computed the content score of this node.
 *
 * @return bool Current value of the private $initialized flag
 */
public function isInitialized()
{
    $alreadyInitialized = $this->initialized;

    return $alreadyInitialized;
}
||
| 78 | |||
| 79 | /** |
||
| 80 | * @return bool |
||
| 81 | */ |
||
| 82 | public function isReadabilityDataTable() |
||
| 96 | // return $this->readabilityDataTable; |
||
| 97 | } |
||
| 98 | |||
/**
 * Marks (or unmarks) this node as a data table by writing the
 * "readabilityDataTable" attribute on the node itself.
 *
 * @param bool $param Truthy to flag the node as a data table, falsy otherwise
 */
public function setReadabilityDataTable($param)
{
    // The flag is stored as '1'/'0': it can't be "true" because DOMDocument casts it to "1".
    $flag = '0';
    if ($param) {
        $flag = '1';
    }

    $this->setAttribute('readabilityDataTable', $flag);
}
||
| 108 | |||
/**
 * Initializer. Computes the starting content score of the node from its tag name
 * (optionally adjusted by the class/id weight) and marks the node as initialized.
 * Calling it again on an already initialized node leaves the score untouched.
 *
 * @ TODO: I don't like the weightClasses param. How can we get the config here?
 *
 * @param $weightClasses bool Whether to add getClassWeight() to the base score
 *
 * @return static The same node, to allow chaining
 */
public function initializeNode($weightClasses)
{
    if ($this->isInitialized()) {
        return $this;
    }

    // Base score per tag name; any tag not listed here starts at 0.
    $baseScores = [
        'div' => 5,

        'pre' => 3,
        'td' => 3,
        'blockquote' => 3,

        'address' => -3,
        'ol' => -3,
        'ul' => -3,
        'dl' => -3,
        'dd' => -3,
        'dt' => -3,
        'li' => -3,
        'form' => -3,

        'h1' => -5,
        'h2' => -5,
        'h3' => -5,
        'h4' => -5,
        'h5' => -5,
        'h6' => -5,
        'th' => -5,
    ];

    $score = isset($baseScores[$this->nodeName]) ? $baseScores[$this->nodeName] : 0;

    if ($weightClasses) {
        $score += $this->getClassWeight();
    }

    $this->contentScore = $score;
    $this->initialized = true;

    return $this;
}
||
| 163 | |||
/**
 * Override for the native getAttribute method. Some nodes have the getAttribute
 * method and some don't, so the attributes property is checked first and the
 * call is only forwarded to the parent implementation when it is not null.
 *
 * @param $attributeName string Attribute to retrieve
 *
 * @return string Value returned by the parent implementation, or '' when the node has no attributes property
 */
public function getAttribute($attributeName)
{
    if (is_null($this->attributes)) {
        return '';
    }

    return parent::getAttribute($attributeName);
}
||
| 180 | |||
/**
 * Override for the native hasAttribute method. The call is only forwarded to the
 * parent implementation when the attributes property of the node is not null.
 *
 * @param $attributeName string Attribute to look for
 *
 * @return bool Result of the parent implementation, or false when the node has no attributes property
 *
 * @see getAttribute
 */
public function hasAttribute($attributeName)
{
    if (is_null($this->attributes)) {
        return false;
    }

    return parent::hasAttribute($attributeName);
}
||
| 198 | |||
| 199 | /** |
||
| 200 | * Get the ancestors of the current node. |
||
| 201 | * |
||
| 202 | * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them |
||
| 203 | * |
||
| 204 | * @return array |
||
| 205 | */ |
||
| 206 | public function getNodeAncestors($maxLevel = 3) |
||
| 223 | } |
||
| 224 | |||
/**
 * Collects every "a" element found below the current element.
 *
 * @return array Result of getElementsByTagName('a') converted to a plain array
 */
public function getAllLinks()
{
    $anchors = $this->getElementsByTagName('a');

    return iterator_to_array($anchors);
}
||
| 234 | |||
/**
 * Get the density of links as a fraction of the content.
 * This is the amount of (whitespace-normalized) text that is inside links divided
 * by the total (whitespace-normalized) text in the node.
 *
 * @return float|int Ratio of link text length to total text length; 0 when the node has no text
 */
public function getLinkDensity()
{
    $totalLength = mb_strlen($this->getTextContent(true));

    // No text at all: bail out early, which also avoids a division by zero below.
    if (!$totalLength) {
        return 0;
    }

    $linkedLength = 0;

    /** @var DOMElement $anchor */
    foreach ($this->getAllLinks() as $anchor) {
        $linkedLength += mb_strlen($anchor->getTextContent(true));
    }

    return $linkedLength / $totalLength;
}
||
| 261 | |||
/**
 * Calculates the weight of the class/id of the current element.
 *
 * Both the "class" and the "id" attribute are tested against the 'negative' and
 * 'positive' patterns of NodeUtility::$regexps; every negative match subtracts 25
 * and every positive match adds 25.
 *
 * @return int Accumulated weight
 */
public function getClassWeight()
{
    $weight = 0;

    foreach (['class', 'id'] as $attributeName) {
        $value = $this->getAttribute($attributeName);

        // Attributes that are empty after trimming do not contribute.
        if (!trim($value)) {
            continue;
        }

        if (preg_match(NodeUtility::$regexps['negative'], $value)) {
            $weight -= 25;
        }

        if (preg_match(NodeUtility::$regexps['positive'], $value)) {
            $weight += 25;
        }
    }

    return $weight;
}
||
| 297 | |||
/**
 * Returns the full text of the node (its nodeValue).
 *
 * @param bool $normalize When true, runs of two or more whitespace characters are collapsed
 *                        into a single space and the result is trimmed
 *
 * @return string
 */
public function getTextContent($normalize = false)
{
    if (!$normalize) {
        return $this->nodeValue;
    }

    $collapsed = preg_replace('/\s{2,}/', ' ', $this->nodeValue);

    return trim($collapsed);
}
||
| 314 | |||
/**
 * Returns the children of the current node as an array.
 *
 * @param bool $filterEmptyDOMText When true, "#text" children whose trimmed value is empty are dropped
 *
 * @deprecated Use NodeUtility::filterTextNodes, function will be removed in version 3.0
 *
 * @return array Child nodes, indexed from 0 without gaps
 */
public function getChildren($filterEmptyDOMText = false)
{
    @trigger_error('getChildren was replaced with NodeUtility::filterTextNodes and will be removed in version 3.0', E_USER_DEPRECATED);

    $children = iterator_to_array($this->childNodes);
    if (!$filterEmptyDOMText) {
        return $children;
    }

    // Appending to a fresh array keeps the keys sequential (0 to whatever without skipping any number).
    $kept = [];
    foreach ($children as $child) {
        if ($child->nodeName !== '#text' || mb_strlen(trim($child->nodeValue))) {
            $kept[] = $child;
        }
    }

    return $kept;
}
||
| 338 | |||
/**
 * Return an array indicating how many rows and columns this table has.
 *
 * Every "tr" below the node contributes its "rowspan" to the row count and every
 * "td" inside a row contributes its "colspan" to that row's column count. A missing,
 * non-numeric or non-positive span counts as 1. The column count is the widest row.
 *
 * @return array Array with the integer keys 'rows' and 'columns'
 */
public function getRowAndColumnCount()
{
    $rows = $columns = 0;

    foreach ($this->getElementsByTagName('tr') as $tr) {
        /** @var \DOMElement $tr */
        // In PHP "$rowspan || 1" is a boolean (always true here), which made every row
        // count as exactly 1. Parse the attribute as an integer and fall back to 1 instead.
        $rowspan = (int) $tr->getAttribute('rowspan');
        $rows += $rowspan > 0 ? $rowspan : 1;

        // Now look for column-related info
        $columnsInThisRow = 0;
        foreach ($tr->getElementsByTagName('td') as $cell) {
            /** @var \DOMElement $cell */
            $colspan = (int) $cell->getAttribute('colspan');
            $columnsInThisRow += $colspan > 0 ? $colspan : 1;
        }

        $columns = max($columns, $columnsInThisRow);
    }

    return ['rows' => $rows, 'columns' => $columns];
}
||
| 366 | |||
| 367 | /** |
||
| 368 | * Creates a new node based on the text content of the original node. |
||
| 369 | * |
||
| 370 | * @param $originalNode DOMNode |
||
| 371 | * @param $tagName string |
||
| 372 | * |
||
| 373 | * @return DOMElement |
||
| 374 | */ |
||
| 375 | public function createNode($originalNode, $tagName) |
||
| 381 | } |
||
| 382 | |||
/**
 * Check if this node has an ancestor whose tag name matches the provided one.
 *
 * The walk starts at the direct parent (depth 0) and moves upwards. When $maxDepth is
 * greater than 0 the search gives up as soon as the depth exceeds it, so ancestors at
 * depths 0 through $maxDepth are inspected; a value of 0 or less removes the limit.
 *
 * @param string   $tagName  Node name to look for
 * @param int      $maxDepth Depth limit as described above
 * @param callable $filterFn Optional extra check; receives the matching ancestor and must
 *                           return a truthy value for the match to count
 *
 * @return bool
 */
public function hasAncestorTag($tagName, $maxDepth = 3, callable $filterFn = null)
{
    for ($depth = 0, $ancestor = $this->parentNode; $ancestor; $ancestor = $ancestor->parentNode, $depth++) {
        if ($maxDepth > 0 && $depth > $maxDepth) {
            return false;
        }

        if ($ancestor->nodeName !== $tagName) {
            continue;
        }

        if (!$filterFn || $filterFn($ancestor)) {
            return true;
        }
    }

    return false;
}
||
| 413 | |||
| 414 | /** |
||
| 415 | * Check if this node has only whitespace and a single element with given tag |
||
| 416 | * or if it contains no element with given tag or more than 1 element. |
||
| 417 | * |
||
| 418 | * @param $tag string Name of tag |
||
| 419 | * |
||
| 420 | * @return bool |
||
| 421 | */ |
||
| 422 | public function hasSingleTagInsideElement($tag) |
||
| 437 | }); |
||
| 438 | } |
||
| 439 | |||
/**
 * Check if the current element contains a block element.
 * Block elements are the ones defined in the divToPElements array.
 *
 * A direct child whose node name is in that list counts; any other child is asked
 * recursively, so a block element anywhere below this node yields true.
 *
 * @return bool
 */
public function hasSingleChildBlockElement()
{
    if (!$this->hasChildNodes()) {
        return false;
    }

    foreach ($this->childNodes as $child) {
        if (in_array($child->nodeName, $this->divToPElements)) {
            return true;
        }

        // NOTE(review): relies on every child node exposing this method too - confirm against the node classes.
        /** @var $child DOMElement */
        if ($child->hasSingleChildBlockElement()) {
            return true;
        }
    }

    return false;
}
||
| 463 | |||
/**
 * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace.
 *
 * @return bool True only for DOMElement nodes whose text is empty once the 'onlyWhitespace'
 *              pattern is stripped and whose children are nothing but BR/HR elements and text nodes
 */
public function isElementWithoutContent()
{
    if (!($this instanceof DOMElement)) {
        return false;
    }

    $remainingText = preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent);
    if (mb_strlen($remainingText) !== 0) {
        return false;
    }

    $childCount = $this->childNodes->length;
    if ($childCount === 0) {
        return true;
    }

    $dividerCount = $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length;

    /*
     * Special PHP DOMDocument case: DOMText nodes are child nodes too (unlike in JS). For
     * something like "<p> <br/></p>" counting only BRs and HRs would never match the child
     * count, so the DOMText children are counted as well. At this point they are known to be
     * empty or whitespace only, because of the mb_strlen check above.
     */
    $textNodeCount = 0;
    foreach ($this->childNodes as $child) {
        if ($child instanceof DOMText) {
            $textNodeCount++;
        }
    }

    return $childCount === $dividerCount + $textNodeCount;
}
||
| 489 | |||
| 490 | /** |
||
| 491 | * Determine if a node qualifies as phrasing content. |
||
| 492 | * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content. |
||
| 493 | * |
||
| 494 | * @return bool |
||
| 495 | */ |
||
| 496 | public function isPhrasingContent() |
||
| 504 | ); |
||
| 505 | } |
||
| 506 | |||
| 507 | /** |
||
| 508 | * In the original JS project they check if the node has the style display=none, which unfortunately |
||
| 509 | * in our case we have no way of knowing that. So we just check for the attribute hidden or "display: none". |
||
| 510 | * |
||
| 511 | * Might be a good idea to check for classes or other attributes like 'aria-hidden' |
||
| 512 | * |
||
| 513 | * @return bool |
||
| 514 | */ |
||
| 515 | public function isProbablyVisible() |
||
| 518 | } |
||
| 519 | |||
/**
 * Tells whether the node is whitespace: either a text node whose trimmed content
 * is empty, or a "br" element.
 *
 * @return bool
 */
public function isWhitespace()
{
    if ($this->nodeType === XML_TEXT_NODE) {
        return mb_strlen(trim($this->textContent)) === 0;
    }

    if ($this->nodeType === XML_ELEMENT_NODE) {
        return $this->nodeName === 'br';
    }

    return false;
}
||
| 528 | |||
| 529 | /** |
||
| 530 | * This is a hack that overcomes the issue of node shifting when scanning and removing nodes. |
||
| 531 | * |
||
| 532 | * In the JS version of getElementsByTagName, if you remove a node it will not appear during the |
||
| 533 | * foreach. This does not happen in PHP DOMDocument, because if you remove a node, it will still appear but as an |
||
| 534 | * orphan node and will give an exception if you try to do anything with it. |
||
| 535 | * |
||
| 536 | * Shifting also occurs when converting parent nodes (like a P to a DIV), which in that case the found nodes are |
||
| 537 | * removed from the foreach "pool" but the internal index of the foreach is not aware and skips over nodes that |
||
| 538 | * never looped over. (index is at position 5, 2 nodes are removed, next one should be node 3, but the foreach tries |
||
| 539 | * to access node 6) |
||
| 540 | * |
||
| 541 | * This function solves this by searching for the nodes on every loop and keeping track of the count differences. |
||
| 542 | * Because on every loop we call getElementsByTagName again, this could cause a performance impact and should be |
||
| 543 | * used only when the results of the search are going to be used to remove the nodes. |
||
| 544 | * |
||
| 545 | * @param string $tag |
||
| 546 | * |
||
| 547 | * @return \Generator |
||
| 548 | */ |
||
| 549 | public function shiftingAwareGetElementsByTagName($tag) |
||
| 566 | } |
||
| 567 | } |
||
| 568 | |||
| 569 | /** |
||
| 570 | * Mimics JS's firstElementChild property. PHP only has firstChild which could be any type of DOMNode. Use this |
||
| 571 | * function to get the first one that is an DOMElement node. |
||
| 572 | * |
||
| 573 | * @return \DOMElement|null |
||
| 574 | */ |
||
| 575 | public function getFirstElementChild() |
||
| 586 | } |
||
| 587 | } |
||
| 588 |