@@ -6,5 +6,5 @@ |
||
| 6 | 6 | |
| 7 | 7 | class DOMEntityReference extends \DOMEntityReference |
| 8 | 8 | { |
| 9 | - use NodeTrait; |
|
| 9 | + use NodeTrait; |
|
| 10 | 10 | } |
@@ -6,5 +6,5 @@ |
||
| 6 | 6 | |
| 7 | 7 | class DOMNotation extends \DOMNotation |
| 8 | 8 | { |
| 9 | - use NodeTrait; |
|
| 9 | + use NodeTrait; |
|
| 10 | 10 | } |
@@ -6,5 +6,5 @@ |
||
| 6 | 6 | |
| 7 | 7 | class DOMComment extends \DOMComment |
| 8 | 8 | { |
| 9 | - use NodeTrait; |
|
| 9 | + use NodeTrait; |
|
| 10 | 10 | } |
@@ -6,5 +6,5 @@ |
||
| 6 | 6 | |
| 7 | 7 | class DOMDocumentFragment extends \DOMDocumentFragment |
| 8 | 8 | { |
| 9 | - use NodeTrait; |
|
| 9 | + use NodeTrait; |
|
| 10 | 10 | } |
@@ -14,69 +14,69 @@ |
||
| 14 | 14 | */ |
| 15 | 15 | class DOMNodeList implements \Countable, \IteratorAggregate |
| 16 | 16 | { |
| 17 | - /** |
|
| 18 | - * @var array |
|
| 19 | - */ |
|
| 20 | - protected $items = []; |
|
| 17 | + /** |
|
| 18 | + * @var array |
|
| 19 | + */ |
|
| 20 | + protected $items = []; |
|
| 21 | 21 | |
| 22 | - /** |
|
| 23 | - * @var int |
|
| 24 | - */ |
|
| 25 | - protected $length = 0; |
|
| 22 | + /** |
|
| 23 | + * @var int |
|
| 24 | + */ |
|
| 25 | + protected $length = 0; |
|
| 26 | 26 | |
| 27 | - /** |
|
| 28 | - * To allow access to length in the same way that DOMNodeList allows. |
|
| 29 | - * |
|
| 30 | - * {@inheritdoc} |
|
| 31 | - */ |
|
| 32 | - public function __get($name) |
|
| 33 | - { |
|
| 34 | - switch ($name) { |
|
| 35 | - case 'length': |
|
| 36 | - return $this->length; |
|
| 37 | - default: |
|
| 38 | - trigger_error(sprintf('Undefined property: %s::%s', static::class, $name)); |
|
| 39 | - } |
|
| 40 | - } |
|
| 27 | + /** |
|
| 28 | + * To allow access to length in the same way that DOMNodeList allows. |
|
| 29 | + * |
|
| 30 | + * {@inheritdoc} |
|
| 31 | + */ |
|
| 32 | + public function __get($name) |
|
| 33 | + { |
|
| 34 | + switch ($name) { |
|
| 35 | + case 'length': |
|
| 36 | + return $this->length; |
|
| 37 | + default: |
|
| 38 | + trigger_error(sprintf('Undefined property: %s::%s', static::class, $name)); |
|
| 39 | + } |
|
| 40 | + } |
|
| 41 | 41 | |
| 42 | - /** |
|
| 43 | - * @param DOMNode|DOMElement|DOMComment $node |
|
| 44 | - * |
|
| 45 | - * @return DOMNodeList |
|
| 46 | - */ |
|
| 47 | - public function add($node) |
|
| 48 | - { |
|
| 49 | - $this->items[] = $node; |
|
| 50 | - $this->length++; |
|
| 42 | + /** |
|
| 43 | + * @param DOMNode|DOMElement|DOMComment $node |
|
| 44 | + * |
|
| 45 | + * @return DOMNodeList |
|
| 46 | + */ |
|
| 47 | + public function add($node) |
|
| 48 | + { |
|
| 49 | + $this->items[] = $node; |
|
| 50 | + $this->length++; |
|
| 51 | 51 | |
| 52 | - return $this; |
|
| 53 | - } |
|
| 52 | + return $this; |
|
| 53 | + } |
|
| 54 | 54 | |
| 55 | - /** |
|
| 56 | - * @param int $offset |
|
| 57 | - * |
|
| 58 | - * @return DOMNode|DOMElement|DOMComment |
|
| 59 | - */ |
|
| 60 | - public function item(int $offset) |
|
| 61 | - { |
|
| 62 | - return $this->items[$offset]; |
|
| 63 | - } |
|
| 55 | + /** |
|
| 56 | + * @param int $offset |
|
| 57 | + * |
|
| 58 | + * @return DOMNode|DOMElement|DOMComment |
|
| 59 | + */ |
|
| 60 | + public function item(int $offset) |
|
| 61 | + { |
|
| 62 | + return $this->items[$offset]; |
|
| 63 | + } |
|
| 64 | 64 | |
| 65 | - /** |
|
| 66 | - * @return int|void |
|
| 67 | - */ |
|
| 68 | - public function count(): int |
|
| 69 | - { |
|
| 70 | - return $this->length; |
|
| 71 | - } |
|
| 65 | + /** |
|
| 66 | + * @return int|void |
|
| 67 | + */ |
|
| 68 | + public function count(): int |
|
| 69 | + { |
|
| 70 | + return $this->length; |
|
| 71 | + } |
|
| 72 | 72 | |
| 73 | - /** |
|
| 74 | - * To make it compatible with iterator_to_array() function. |
|
| 75 | - * |
|
| 76 | - * {@inheritdoc} |
|
| 77 | - */ |
|
| 78 | - public function getIterator(): \ArrayIterator |
|
| 79 | - { |
|
| 80 | - return new \ArrayIterator($this->items); |
|
| 81 | - } |
|
| 73 | + /** |
|
| 74 | + * To make it compatible with iterator_to_array() function. |
|
| 75 | + * |
|
| 76 | + * {@inheritdoc} |
|
| 77 | + */ |
|
| 78 | + public function getIterator(): \ArrayIterator |
|
| 79 | + { |
|
| 80 | + return new \ArrayIterator($this->items); |
|
| 81 | + } |
|
| 82 | 82 | } |
@@ -6,25 +6,25 @@ |
||
| 6 | 6 | |
| 7 | 7 | class DOMDocument extends \DOMDocument |
| 8 | 8 | { |
| 9 | - use NodeTrait; |
|
| 9 | + use NodeTrait; |
|
| 10 | 10 | |
| 11 | - public function __construct($version, $encoding) |
|
| 12 | - { |
|
| 13 | - parent::__construct($version, $encoding); |
|
| 11 | + public function __construct($version, $encoding) |
|
| 12 | + { |
|
| 13 | + parent::__construct($version, $encoding); |
|
| 14 | 14 | |
| 15 | - $this->registerNodeClass('DOMAttr', DOMAttr::class); |
|
| 16 | - $this->registerNodeClass('DOMCdataSection', DOMCdataSection::class); |
|
| 17 | - $this->registerNodeClass('DOMCharacterData', DOMCharacterData::class); |
|
| 18 | - $this->registerNodeClass('DOMComment', DOMComment::class); |
|
| 19 | - $this->registerNodeClass('DOMDocument', self::class); |
|
| 20 | - $this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class); |
|
| 21 | - $this->registerNodeClass('DOMDocumentType', DOMDocumentType::class); |
|
| 22 | - $this->registerNodeClass('DOMElement', DOMElement::class); |
|
| 23 | - $this->registerNodeClass('DOMEntity', DOMEntity::class); |
|
| 24 | - $this->registerNodeClass('DOMEntityReference', DOMEntityReference::class); |
|
| 25 | - $this->registerNodeClass('DOMNode', DOMNode::class); |
|
| 26 | - $this->registerNodeClass('DOMNotation', DOMNotation::class); |
|
| 27 | - $this->registerNodeClass('DOMProcessingInstruction', DOMProcessingInstruction::class); |
|
| 28 | - $this->registerNodeClass('DOMText', DOMText::class); |
|
| 29 | - } |
|
| 15 | + $this->registerNodeClass('DOMAttr', DOMAttr::class); |
|
| 16 | + $this->registerNodeClass('DOMCdataSection', DOMCdataSection::class); |
|
| 17 | + $this->registerNodeClass('DOMCharacterData', DOMCharacterData::class); |
|
| 18 | + $this->registerNodeClass('DOMComment', DOMComment::class); |
|
| 19 | + $this->registerNodeClass('DOMDocument', self::class); |
|
| 20 | + $this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class); |
|
| 21 | + $this->registerNodeClass('DOMDocumentType', DOMDocumentType::class); |
|
| 22 | + $this->registerNodeClass('DOMElement', DOMElement::class); |
|
| 23 | + $this->registerNodeClass('DOMEntity', DOMEntity::class); |
|
| 24 | + $this->registerNodeClass('DOMEntityReference', DOMEntityReference::class); |
|
| 25 | + $this->registerNodeClass('DOMNode', DOMNode::class); |
|
| 26 | + $this->registerNodeClass('DOMNotation', DOMNotation::class); |
|
| 27 | + $this->registerNodeClass('DOMProcessingInstruction', DOMProcessingInstruction::class); |
|
| 28 | + $this->registerNodeClass('DOMText', DOMText::class); |
|
| 29 | + } |
|
| 30 | 30 | } |
@@ -6,5 +6,5 @@ |
||
| 6 | 6 | |
| 7 | 7 | class DOMEntity extends \DOMEntity |
| 8 | 8 | { |
| 9 | - use NodeTrait; |
|
| 9 | + use NodeTrait; |
|
| 10 | 10 | } |
@@ -12,126 +12,126 @@ discard block |
||
| 12 | 12 | */ |
| 13 | 13 | class NodeUtility |
| 14 | 14 | { |
| 15 | - /** |
|
| 16 | - * Collection of regexps to check the node usability. |
|
| 17 | - * |
|
| 18 | - * @var array |
|
| 19 | - */ |
|
| 20 | - public static $regexps = [ |
|
| 21 | - 'unlikelyCandidates' => '/-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', |
|
| 22 | - 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', |
|
| 23 | - 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', |
|
| 24 | - 'byline' => '/byline|author|dateline|writtenby|p-author/i', |
|
| 25 | - 'replaceFonts' => '/<(\/?)font[^>]*>/gi', |
|
| 26 | - 'normalize' => '/\s{2,}/', |
|
| 27 | - 'videos' => '/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i', |
|
| 28 | - 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i', |
|
| 29 | - 'prevLink' => '/(prev|earl|old|new|<|«)/i', |
|
| 30 | - 'whitespace' => '/^\s*$/', |
|
| 31 | - 'hasContent' => '/\S$/', |
|
| 32 | - 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i', |
|
| 33 | - 'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i', |
|
| 34 | - // \x{00A0} is the unicode version of &nbsp; |
|
| 35 | - 'onlyWhitespace' => '/\x{00A0}|\s+/u' |
|
| 36 | - ]; |
|
| 37 | - |
|
| 38 | - /** |
|
| 39 | - * Imported from the Element class on league\html-to-markdown. |
|
| 40 | - * |
|
| 41 | - * @param $node |
|
| 42 | - * |
|
| 43 | - * @return DOMElement |
|
| 44 | - */ |
|
| 45 | - public static function nextElement($node) |
|
| 46 | - { |
|
| 47 | - $next = $node; |
|
| 48 | - while ($next |
|
| 49 | - && $next->nodeType !== XML_ELEMENT_NODE |
|
| 50 | - && $next->isWhitespace()) { |
|
| 51 | - $next = $next->nextSibling; |
|
| 52 | - } |
|
| 53 | - |
|
| 54 | - return $next; |
|
| 55 | - } |
|
| 56 | - |
|
| 57 | - /** |
|
| 58 | - * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new |
|
| 59 | - * element with the new tag name and importing it to the main DOMDocument. |
|
| 60 | - * |
|
| 61 | - * @param DOMNode $node |
|
| 62 | - * @param string $value |
|
| 63 | - * @param bool $importAttributes |
|
| 64 | - * |
|
| 65 | - * @return DOMNode |
|
| 66 | - */ |
|
| 67 | - public static function setNodeTag($node, $value, $importAttributes = true) |
|
| 68 | - { |
|
| 69 | - $new = new DOMDocument('1.0', 'utf-8'); |
|
| 70 | - $new->appendChild($new->createElement($value)); |
|
| 71 | - |
|
| 72 | - $children = $node->childNodes; |
|
| 73 | - /** @var $children \DOMNodeList $i */ |
|
| 74 | - for ($i = 0; $i < $children->length; $i++) { |
|
| 75 | - $import = $new->importNode($children->item($i), true); |
|
| 76 | - $new->firstChild->appendChild($import); |
|
| 77 | - } |
|
| 78 | - |
|
| 79 | - if ($importAttributes) { |
|
| 80 | - // Import attributes from the original node. |
|
| 81 | - foreach ($node->attributes as $attribute) { |
|
| 82 | - $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue); |
|
| 83 | - } |
|
| 84 | - } |
|
| 85 | - |
|
| 86 | - // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement. |
|
| 87 | - $import = $node->ownerDocument->importNode($new->firstChild, true); |
|
| 88 | - $node->parentNode->replaceChild($import, $node); |
|
| 89 | - |
|
| 90 | - return $import; |
|
| 91 | - } |
|
| 92 | - |
|
| 93 | - /** |
|
| 94 | - * Removes the current node and returns the next node to be parsed (child, sibling or parent). |
|
| 95 | - * |
|
| 96 | - * @param DOMNode $node |
|
| 97 | - * |
|
| 98 | - * @return DOMNode |
|
| 99 | - */ |
|
| 100 | - public static function removeAndGetNext($node) |
|
| 101 | - { |
|
| 102 | - $nextNode = self::getNextNode($node, true); |
|
| 103 | - $node->parentNode->removeChild($node); |
|
| 104 | - |
|
| 105 | - return $nextNode; |
|
| 106 | - } |
|
| 107 | - |
|
| 108 | - /** |
|
| 109 | - * Remove the selected node. |
|
| 110 | - * |
|
| 111 | - * @param $node DOMElement |
|
| 112 | - * |
|
| 113 | - * @return void |
|
| 114 | - **/ |
|
| 115 | - public static function removeNode($node) |
|
| 116 | - { |
|
| 117 | - $parent = $node->parentNode; |
|
| 118 | - if ($parent) { |
|
| 119 | - $parent->removeChild($node); |
|
| 120 | - } |
|
| 121 | - } |
|
| 122 | - |
|
| 123 | - /** |
|
| 124 | - * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally |
|
| 125 | - * for parents. |
|
| 126 | - * |
|
| 127 | - * @param DOMNode $originalNode |
|
| 128 | - * @param bool $ignoreSelfAndKids |
|
| 129 | - * |
|
| 130 | - * @return DOMNode |
|
| 131 | - */ |
|
| 132 | - public static function getNextNode($originalNode, $ignoreSelfAndKids = false) |
|
| 133 | - { |
|
| 134 | - /* |
|
| 15 | + /** |
|
| 16 | + * Collection of regexps to check the node usability. |
|
| 17 | + * |
|
| 18 | + * @var array |
|
| 19 | + */ |
|
| 20 | + public static $regexps = [ |
|
| 21 | + 'unlikelyCandidates' => '/-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', |
|
| 22 | + 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', |
|
| 23 | + 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', |
|
| 24 | + 'byline' => '/byline|author|dateline|writtenby|p-author/i', |
|
| 25 | + 'replaceFonts' => '/<(\/?)font[^>]*>/gi', |
|
| 26 | + 'normalize' => '/\s{2,}/', |
|
| 27 | + 'videos' => '/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i', |
|
| 28 | + 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i', |
|
| 29 | + 'prevLink' => '/(prev|earl|old|new|<|«)/i', |
|
| 30 | + 'whitespace' => '/^\s*$/', |
|
| 31 | + 'hasContent' => '/\S$/', |
|
| 32 | + 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i', |
|
| 33 | + 'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i', |
|
| 34 | + // \x{00A0} is the unicode version of &nbsp; |
|
| 35 | + 'onlyWhitespace' => '/\x{00A0}|\s+/u' |
|
| 36 | + ]; |
|
| 37 | + |
|
| 38 | + /** |
|
| 39 | + * Imported from the Element class on league\html-to-markdown. |
|
| 40 | + * |
|
| 41 | + * @param $node |
|
| 42 | + * |
|
| 43 | + * @return DOMElement |
|
| 44 | + */ |
|
| 45 | + public static function nextElement($node) |
|
| 46 | + { |
|
| 47 | + $next = $node; |
|
| 48 | + while ($next |
|
| 49 | + && $next->nodeType !== XML_ELEMENT_NODE |
|
| 50 | + && $next->isWhitespace()) { |
|
| 51 | + $next = $next->nextSibling; |
|
| 52 | + } |
|
| 53 | + |
|
| 54 | + return $next; |
|
| 55 | + } |
|
| 56 | + |
|
| 57 | + /** |
|
| 58 | + * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new |
|
| 59 | + * element with the new tag name and importing it to the main DOMDocument. |
|
| 60 | + * |
|
| 61 | + * @param DOMNode $node |
|
| 62 | + * @param string $value |
|
| 63 | + * @param bool $importAttributes |
|
| 64 | + * |
|
| 65 | + * @return DOMNode |
|
| 66 | + */ |
|
| 67 | + public static function setNodeTag($node, $value, $importAttributes = true) |
|
| 68 | + { |
|
| 69 | + $new = new DOMDocument('1.0', 'utf-8'); |
|
| 70 | + $new->appendChild($new->createElement($value)); |
|
| 71 | + |
|
| 72 | + $children = $node->childNodes; |
|
| 73 | + /** @var $children \DOMNodeList $i */ |
|
| 74 | + for ($i = 0; $i < $children->length; $i++) { |
|
| 75 | + $import = $new->importNode($children->item($i), true); |
|
| 76 | + $new->firstChild->appendChild($import); |
|
| 77 | + } |
|
| 78 | + |
|
| 79 | + if ($importAttributes) { |
|
| 80 | + // Import attributes from the original node. |
|
| 81 | + foreach ($node->attributes as $attribute) { |
|
| 82 | + $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue); |
|
| 83 | + } |
|
| 84 | + } |
|
| 85 | + |
|
| 86 | + // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement. |
|
| 87 | + $import = $node->ownerDocument->importNode($new->firstChild, true); |
|
| 88 | + $node->parentNode->replaceChild($import, $node); |
|
| 89 | + |
|
| 90 | + return $import; |
|
| 91 | + } |
|
| 92 | + |
|
| 93 | + /** |
|
| 94 | + * Removes the current node and returns the next node to be parsed (child, sibling or parent). |
|
| 95 | + * |
|
| 96 | + * @param DOMNode $node |
|
| 97 | + * |
|
| 98 | + * @return DOMNode |
|
| 99 | + */ |
|
| 100 | + public static function removeAndGetNext($node) |
|
| 101 | + { |
|
| 102 | + $nextNode = self::getNextNode($node, true); |
|
| 103 | + $node->parentNode->removeChild($node); |
|
| 104 | + |
|
| 105 | + return $nextNode; |
|
| 106 | + } |
|
| 107 | + |
|
| 108 | + /** |
|
| 109 | + * Remove the selected node. |
|
| 110 | + * |
|
| 111 | + * @param $node DOMElement |
|
| 112 | + * |
|
| 113 | + * @return void |
|
| 114 | + **/ |
|
| 115 | + public static function removeNode($node) |
|
| 116 | + { |
|
| 117 | + $parent = $node->parentNode; |
|
| 118 | + if ($parent) { |
|
| 119 | + $parent->removeChild($node); |
|
| 120 | + } |
|
| 121 | + } |
|
| 122 | + |
|
| 123 | + /** |
|
| 124 | + * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally |
|
| 125 | + * for parents. |
|
| 126 | + * |
|
| 127 | + * @param DOMNode $originalNode |
|
| 128 | + * @param bool $ignoreSelfAndKids |
|
| 129 | + * |
|
| 130 | + * @return DOMNode |
|
| 131 | + */ |
|
| 132 | + public static function getNextNode($originalNode, $ignoreSelfAndKids = false) |
|
| 133 | + { |
|
| 134 | + /* |
|
| 135 | 135 | * Traverse the DOM from node to node, starting at the node passed in. |
| 136 | 136 | * Pass true for the second parameter to indicate this node itself |
| 137 | 137 | * (and its kids) are going away, and we want the next node over. |
@@ -139,42 +139,42 @@ discard block |
||
| 139 | 139 | * Calling this in a loop will traverse the DOM depth-first. |
| 140 | 140 | */ |
| 141 | 141 | |
| 142 | - // First check for kids if those aren't being ignored |
|
| 143 | - if (!$ignoreSelfAndKids && $originalNode->firstChild) { |
|
| 144 | - return $originalNode->firstChild; |
|
| 145 | - } |
|
| 146 | - |
|
| 147 | - // Then for siblings... |
|
| 148 | - if ($originalNode->nextSibling) { |
|
| 149 | - return $originalNode->nextSibling; |
|
| 150 | - } |
|
| 151 | - |
|
| 152 | - // And finally, move up the parent chain *and* find a sibling |
|
| 153 | - // (because this is depth-first traversal, we will have already |
|
| 154 | - // seen the parent nodes themselves). |
|
| 155 | - do { |
|
| 156 | - $originalNode = $originalNode->parentNode; |
|
| 157 | - } while ($originalNode && !$originalNode->nextSibling); |
|
| 158 | - |
|
| 159 | - return ($originalNode) ? $originalNode->nextSibling : $originalNode; |
|
| 160 | - } |
|
| 161 | - |
|
| 162 | - /** |
|
| 163 | - * Remove all empty DOMNodes from DOMNodeLists. |
|
| 164 | - * |
|
| 165 | - * @param \DOMNodeList $list |
|
| 166 | - * |
|
| 167 | - * @return DOMNodeList |
|
| 168 | - */ |
|
| 169 | - public static function filterTextNodes(\DOMNodeList $list) |
|
| 170 | - { |
|
| 171 | - $newList = new DOMNodeList(); |
|
| 172 | - foreach ($list as $node) { |
|
| 173 | - if ($node->nodeType !== XML_TEXT_NODE || mb_strlen(trim($node->nodeValue))) { |
|
| 174 | - $newList->add($node); |
|
| 175 | - } |
|
| 176 | - } |
|
| 177 | - |
|
| 178 | - return $newList; |
|
| 179 | - } |
|
| 142 | + // First check for kids if those aren't being ignored |
|
| 143 | + if (!$ignoreSelfAndKids && $originalNode->firstChild) { |
|
| 144 | + return $originalNode->firstChild; |
|
| 145 | + } |
|
| 146 | + |
|
| 147 | + // Then for siblings... |
|
| 148 | + if ($originalNode->nextSibling) { |
|
| 149 | + return $originalNode->nextSibling; |
|
| 150 | + } |
|
| 151 | + |
|
| 152 | + // And finally, move up the parent chain *and* find a sibling |
|
| 153 | + // (because this is depth-first traversal, we will have already |
|
| 154 | + // seen the parent nodes themselves). |
|
| 155 | + do { |
|
| 156 | + $originalNode = $originalNode->parentNode; |
|
| 157 | + } while ($originalNode && !$originalNode->nextSibling); |
|
| 158 | + |
|
| 159 | + return ($originalNode) ? $originalNode->nextSibling : $originalNode; |
|
| 160 | + } |
|
| 161 | + |
|
| 162 | + /** |
|
| 163 | + * Remove all empty DOMNodes from DOMNodeLists. |
|
| 164 | + * |
|
| 165 | + * @param \DOMNodeList $list |
|
| 166 | + * |
|
| 167 | + * @return DOMNodeList |
|
| 168 | + */ |
|
| 169 | + public static function filterTextNodes(\DOMNodeList $list) |
|
| 170 | + { |
|
| 171 | + $newList = new DOMNodeList(); |
|
| 172 | + foreach ($list as $node) { |
|
| 173 | + if ($node->nodeType !== XML_TEXT_NODE || mb_strlen(trim($node->nodeValue))) { |
|
| 174 | + $newList->add($node); |
|
| 175 | + } |
|
| 176 | + } |
|
| 177 | + |
|
| 178 | + return $newList; |
|
| 179 | + } |
|
| 180 | 180 | } |
@@ -13,75 +13,75 @@ discard block |
||
| 13 | 13 | */ |
| 14 | 14 | trait NodeTrait |
| 15 | 15 | { |
| 16 | - /** |
|
| 17 | - * Content score of the node. Used to determine the value of the content. |
|
| 18 | - * |
|
| 19 | - * @var int |
|
| 20 | - */ |
|
| 21 | - public $contentScore = 0; |
|
| 22 | - |
|
| 23 | - /** |
|
| 24 | - * Flag for initialized status. |
|
| 25 | - * |
|
| 26 | - * @var bool |
|
| 27 | - */ |
|
| 28 | - private $initialized = false; |
|
| 29 | - |
|
| 30 | - /** |
|
| 31 | - * Flag data tables. |
|
| 32 | - * |
|
| 33 | - * @var bool |
|
| 34 | - */ |
|
| 35 | - private $readabilityDataTable = false; |
|
| 36 | - |
|
| 37 | - /** |
|
| 38 | - * @var array |
|
| 39 | - */ |
|
| 40 | - private $divToPElements = [ |
|
| 41 | - 'a', |
|
| 42 | - 'blockquote', |
|
| 43 | - 'dl', |
|
| 44 | - 'div', |
|
| 45 | - 'img', |
|
| 46 | - 'ol', |
|
| 47 | - 'p', |
|
| 48 | - 'pre', |
|
| 49 | - 'table', |
|
| 50 | - 'ul', |
|
| 51 | - 'select', |
|
| 52 | - ]; |
|
| 53 | - |
|
| 54 | - /** |
|
| 55 | - * The commented out elements qualify as phrasing content but tend to be |
|
| 56 | - * removed by readability when put into paragraphs, so we ignore them here. |
|
| 57 | - * |
|
| 58 | - * @var array |
|
| 59 | - */ |
|
| 60 | - private $phrasing_elems = [ |
|
| 61 | - // 'CANVAS', 'IFRAME', 'SVG', 'VIDEO', |
|
| 62 | - 'abbr', 'audio', 'b', 'bdo', 'br', 'button', 'cite', 'code', 'data', |
|
| 63 | - 'datalist', 'dfn', 'em', 'embed', 'i', 'img', 'input', 'kbd', 'label', |
|
| 64 | - 'mark', 'math', 'meter', 'noscript', 'object', 'output', 'progress', 'q', |
|
| 65 | - 'ruby', 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', |
|
| 66 | - 'sup', 'textarea', 'time', 'var', 'wbr' |
|
| 67 | - ]; |
|
| 68 | - |
|
| 69 | - /** |
|
| 70 | - * initialized getter. |
|
| 71 | - * |
|
| 72 | - * @return bool |
|
| 73 | - */ |
|
| 74 | - public function isInitialized() |
|
| 75 | - { |
|
| 76 | - return $this->initialized; |
|
| 77 | - } |
|
| 78 | - |
|
| 79 | - /** |
|
| 80 | - * @return bool |
|
| 81 | - */ |
|
| 82 | - public function isReadabilityDataTable() |
|
| 83 | - { |
|
| 84 | - /* |
|
| 16 | + /** |
|
| 17 | + * Content score of the node. Used to determine the value of the content. |
|
| 18 | + * |
|
| 19 | + * @var int |
|
| 20 | + */ |
|
| 21 | + public $contentScore = 0; |
|
| 22 | + |
|
| 23 | + /** |
|
| 24 | + * Flag for initialized status. |
|
| 25 | + * |
|
| 26 | + * @var bool |
|
| 27 | + */ |
|
| 28 | + private $initialized = false; |
|
| 29 | + |
|
| 30 | + /** |
|
| 31 | + * Flag data tables. |
|
| 32 | + * |
|
| 33 | + * @var bool |
|
| 34 | + */ |
|
| 35 | + private $readabilityDataTable = false; |
|
| 36 | + |
|
| 37 | + /** |
|
| 38 | + * @var array |
|
| 39 | + */ |
|
| 40 | + private $divToPElements = [ |
|
| 41 | + 'a', |
|
| 42 | + 'blockquote', |
|
| 43 | + 'dl', |
|
| 44 | + 'div', |
|
| 45 | + 'img', |
|
| 46 | + 'ol', |
|
| 47 | + 'p', |
|
| 48 | + 'pre', |
|
| 49 | + 'table', |
|
| 50 | + 'ul', |
|
| 51 | + 'select', |
|
| 52 | + ]; |
|
| 53 | + |
|
| 54 | + /** |
|
| 55 | + * The commented out elements qualify as phrasing content but tend to be |
|
| 56 | + * removed by readability when put into paragraphs, so we ignore them here. |
|
| 57 | + * |
|
| 58 | + * @var array |
|
| 59 | + */ |
|
| 60 | + private $phrasing_elems = [ |
|
| 61 | + // 'CANVAS', 'IFRAME', 'SVG', 'VIDEO', |
|
| 62 | + 'abbr', 'audio', 'b', 'bdo', 'br', 'button', 'cite', 'code', 'data', |
|
| 63 | + 'datalist', 'dfn', 'em', 'embed', 'i', 'img', 'input', 'kbd', 'label', |
|
| 64 | + 'mark', 'math', 'meter', 'noscript', 'object', 'output', 'progress', 'q', |
|
| 65 | + 'ruby', 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', |
|
| 66 | + 'sup', 'textarea', 'time', 'var', 'wbr' |
|
| 67 | + ]; |
|
| 68 | + |
|
| 69 | + /** |
|
| 70 | + * initialized getter. |
|
| 71 | + * |
|
| 72 | + * @return bool |
|
| 73 | + */ |
|
| 74 | + public function isInitialized() |
|
| 75 | + { |
|
| 76 | + return $this->initialized; |
|
| 77 | + } |
|
| 78 | + |
|
| 79 | + /** |
|
| 80 | + * @return bool |
|
| 81 | + */ |
|
| 82 | + public function isReadabilityDataTable() |
|
| 83 | + { |
|
| 84 | + /* |
|
| 85 | 85 | * This is a workaround that I'd like to remove in the future. |
| 86 | 86 | * Seems that although we are extending the base DOMElement and adding custom properties (like this one, |
| 87 | 87 | * 'readabilityDataTable'), these properties get lost when you search for elements with getElementsByTagName. |
@@ -91,388 +91,388 @@ discard block |
||
| 91 | 91 | * |
| 92 | 92 | * @see https://stackoverflow.com/questions/35654709/php-registernodeclass-and-reusing-variable-names |
| 93 | 93 | */ |
| 94 | - return $this->hasAttribute('readabilityDataTable') |
|
| 95 | - && $this->getAttribute('readabilityDataTable') === '1'; |
|
| 94 | + return $this->hasAttribute('readabilityDataTable') |
|
| 95 | + && $this->getAttribute('readabilityDataTable') === '1'; |
|
| 96 | 96 | // return $this->readabilityDataTable; |
| 97 | - } |
|
| 98 | - |
|
| 99 | - /** |
|
| 100 | - * @param bool $param |
|
| 101 | - */ |
|
| 102 | - public function setReadabilityDataTable($param) |
|
| 103 | - { |
|
| 104 | - // Can't be "true" because DOMDocument casts it to "1" |
|
| 105 | - $this->setAttribute('readabilityDataTable', $param ? '1' : '0'); |
|
| 97 | + } |
|
| 98 | + |
|
| 99 | + /** |
|
| 100 | + * @param bool $param |
|
| 101 | + */ |
|
| 102 | + public function setReadabilityDataTable($param) |
|
| 103 | + { |
|
| 104 | + // Can't be "true" because DOMDocument casts it to "1" |
|
| 105 | + $this->setAttribute('readabilityDataTable', $param ? '1' : '0'); |
|
| 106 | 106 | // $this->readabilityDataTable = $param; |
| 107 | - } |
|
| 108 | - |
|
| 109 | - /** |
|
| 110 | - * Initializer. Calculates the current score of the node and returns a full Readability object. |
|
| 111 | - * |
|
| 112 | - * @ TODO: I don't like the weightClasses param. How can we get the config here? |
|
| 113 | - * |
|
| 114 | - * @param $weightClasses bool Weight classes? |
|
| 115 | - * |
|
| 116 | - * @return static |
|
| 117 | - */ |
|
| 118 | - public function initializeNode($weightClasses) |
|
| 119 | - { |
|
| 120 | - if (!$this->isInitialized()) { |
|
| 121 | - $contentScore = 0; |
|
| 122 | - |
|
| 123 | - switch ($this->nodeName) { |
|
| 124 | - case 'div': |
|
| 125 | - $contentScore += 5; |
|
| 126 | - break; |
|
| 127 | - |
|
| 128 | - case 'pre': |
|
| 129 | - case 'td': |
|
| 130 | - case 'blockquote': |
|
| 131 | - $contentScore += 3; |
|
| 132 | - break; |
|
| 133 | - |
|
| 134 | - case 'address': |
|
| 135 | - case 'ol': |
|
| 136 | - case 'ul': |
|
| 137 | - case 'dl': |
|
| 138 | - case 'dd': |
|
| 139 | - case 'dt': |
|
| 140 | - case 'li': |
|
| 141 | - case 'form': |
|
| 142 | - $contentScore -= 3; |
|
| 143 | - break; |
|
| 144 | - |
|
| 145 | - case 'h1': |
|
| 146 | - case 'h2': |
|
| 147 | - case 'h3': |
|
| 148 | - case 'h4': |
|
| 149 | - case 'h5': |
|
| 150 | - case 'h6': |
|
| 151 | - case 'th': |
|
| 152 | - $contentScore -= 5; |
|
| 153 | - break; |
|
| 154 | - } |
|
| 155 | - |
|
| 156 | - $this->contentScore = $contentScore + ($weightClasses ? $this->getClassWeight() : 0); |
|
| 157 | - |
|
| 158 | - $this->initialized = true; |
|
| 159 | - } |
|
| 160 | - |
|
| 161 | - return $this; |
|
| 162 | - } |
|
| 163 | - |
|
| 164 | - /** |
|
| 165 | - * Override for native getAttribute method. Some nodes have the getAttribute method, some don't, so we need |
|
| 166 | - * to check first the existence of the attributes property. |
|
| 167 | - * |
|
| 168 | - * @param $attributeName string Attribute to retrieve |
|
| 169 | - * |
|
| 170 | - * @return string |
|
| 171 | - */ |
|
| 172 | - public function getAttribute($attributeName) |
|
| 173 | - { |
|
| 174 | - if (!is_null($this->attributes)) { |
|
| 175 | - return parent::getAttribute($attributeName); |
|
| 176 | - } |
|
| 177 | - |
|
| 178 | - return ''; |
|
| 179 | - } |
|
| 180 | - |
|
| 181 | - /** |
|
| 182 | - * Override for native hasAttribute. |
|
| 183 | - * |
|
| 184 | - * @param $attributeName |
|
| 185 | - * |
|
| 186 | - * @return bool |
|
| 187 | - * |
|
| 188 | - * @see getAttribute |
|
| 189 | - */ |
|
| 190 | - public function hasAttribute($attributeName) |
|
| 191 | - { |
|
| 192 | - if (!is_null($this->attributes)) { |
|
| 193 | - return parent::hasAttribute($attributeName); |
|
| 194 | - } |
|
| 195 | - |
|
| 196 | - return false; |
|
| 197 | - } |
|
| 198 | - |
|
| 199 | - /** |
|
| 200 | - * Get the ancestors of the current node. |
|
| 201 | - * |
|
| 202 | - * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them |
|
| 203 | - * |
|
| 204 | - * @return array |
|
| 205 | - */ |
|
| 206 | - public function getNodeAncestors($maxLevel = 3) |
|
| 207 | - { |
|
| 208 | - $ancestors = []; |
|
| 209 | - $level = 0; |
|
| 210 | - |
|
| 211 | - $node = $this->parentNode; |
|
| 212 | - |
|
| 213 | - while ($node && !($node instanceof DOMDocument)) { |
|
| 214 | - $ancestors[] = $node; |
|
| 215 | - $level++; |
|
| 216 | - if ($level === $maxLevel) { |
|
| 217 | - break; |
|
| 218 | - } |
|
| 219 | - $node = $node->parentNode; |
|
| 220 | - } |
|
| 221 | - |
|
| 222 | - return $ancestors; |
|
| 223 | - } |
|
| 224 | - |
|
| 225 | - /** |
|
| 226 | - * Returns all links from the current element. |
|
| 227 | - * |
|
| 228 | - * @return array |
|
| 229 | - */ |
|
| 230 | - public function getAllLinks() |
|
| 231 | - { |
|
| 232 | - return iterator_to_array($this->getElementsByTagName('a')); |
|
| 233 | - } |
|
| 234 | - |
|
| 235 | - /** |
|
| 236 | - * Get the density of links as a percentage of the content |
|
| 237 | - * This is the amount of text that is inside a link divided by the total text in the node. |
|
| 238 | - * |
|
| 239 | - * @return int |
|
| 240 | - */ |
|
| 241 | - public function getLinkDensity() |
|
| 242 | - { |
|
| 243 | - $linkLength = 0; |
|
| 244 | - $textLength = mb_strlen($this->getTextContent(true)); |
|
| 245 | - |
|
| 246 | - if (!$textLength) { |
|
| 247 | - return 0; |
|
| 248 | - } |
|
| 249 | - |
|
| 250 | - $links = $this->getAllLinks(); |
|
| 251 | - |
|
| 252 | - if ($links) { |
|
| 253 | - /** @var DOMElement $link */ |
|
| 254 | - foreach ($links as $link) { |
|
| 255 | - $linkLength += mb_strlen($link->getTextContent(true)); |
|
| 256 | - } |
|
| 257 | - } |
|
| 258 | - |
|
| 259 | - return $linkLength / $textLength; |
|
| 260 | - } |
|
| 261 | - |
|
| 262 | - /** |
|
| 263 | - * Calculates the weight of the class/id of the current element. |
|
| 264 | - * |
|
| 265 | - * @return int |
|
| 266 | - */ |
|
| 267 | - public function getClassWeight() |
|
| 268 | - { |
|
| 269 | - $weight = 0; |
|
| 270 | - |
|
| 271 | - // Look for a special classname |
|
| 272 | - $class = $this->getAttribute('class'); |
|
| 273 | - if (trim($class)) { |
|
| 274 | - if (preg_match(NodeUtility::$regexps['negative'], $class)) { |
|
| 275 | - $weight -= 25; |
|
| 276 | - } |
|
| 277 | - |
|
| 278 | - if (preg_match(NodeUtility::$regexps['positive'], $class)) { |
|
| 279 | - $weight += 25; |
|
| 280 | - } |
|
| 281 | - } |
|
| 282 | - |
|
| 283 | - // Look for a special ID |
|
| 284 | - $id = $this->getAttribute('id'); |
|
| 285 | - if (trim($id)) { |
|
| 286 | - if (preg_match(NodeUtility::$regexps['negative'], $id)) { |
|
| 287 | - $weight -= 25; |
|
| 288 | - } |
|
| 289 | - |
|
| 290 | - if (preg_match(NodeUtility::$regexps['positive'], $id)) { |
|
| 291 | - $weight += 25; |
|
| 292 | - } |
|
| 293 | - } |
|
| 294 | - |
|
| 295 | - return $weight; |
|
| 296 | - } |
|
| 297 | - |
|
| 298 | - /** |
|
| 299 | - * Returns the full text of the node. |
|
| 300 | - * |
|
| 301 | - * @param bool $normalize Normalize white space? |
|
| 302 | - * |
|
| 303 | - * @return string |
|
| 304 | - */ |
|
| 305 | - public function getTextContent($normalize = false) |
|
| 306 | - { |
|
| 307 | - $nodeValue = $this->nodeValue; |
|
| 308 | - if ($normalize) { |
|
| 309 | - $nodeValue = trim(preg_replace('/\s{2,}/', ' ', $nodeValue)); |
|
| 310 | - } |
|
| 311 | - |
|
| 312 | - return $nodeValue; |
|
| 313 | - } |
|
| 314 | - |
|
| 315 | - /** |
|
| 316 | - * Returns the children of the current node. |
|
| 317 | - * |
|
| 318 | - * @param bool $filterEmptyDOMText Filter empty DOMText nodes? |
|
| 319 | - * |
|
| 320 | - * @deprecated Use NodeUtility::filterTextNodes, function will be removed in version 3.0 |
|
| 321 | - * |
|
| 322 | - * @return array |
|
| 323 | - */ |
|
| 324 | - public function getChildren($filterEmptyDOMText = false) |
|
| 325 | - { |
|
| 326 | - @trigger_error('getChildren was replaced with NodeUtility::filterTextNodes and will be removed in version 3.0', E_USER_DEPRECATED); |
|
| 327 | - |
|
| 328 | - $ret = iterator_to_array($this->childNodes); |
|
| 329 | - if ($filterEmptyDOMText) { |
|
| 330 | - // Array values is used to discard the key order. Needs to be 0 to whatever without skipping any number |
|
| 331 | - $ret = array_values(array_filter($ret, function ($node) { |
|
| 332 | - return $node->nodeName !== '#text' || mb_strlen(trim($node->nodeValue)); |
|
| 333 | - })); |
|
| 334 | - } |
|
| 335 | - |
|
| 336 | - return $ret; |
|
| 337 | - } |
|
| 338 | - |
|
| 339 | - /** |
|
| 340 | - * Return an array indicating how many rows and columns this table has. |
|
| 341 | - * |
|
| 342 | - * @return array |
|
| 343 | - */ |
|
| 344 | - public function getRowAndColumnCount() |
|
| 345 | - { |
|
| 346 | - $rows = $columns = 0; |
|
| 347 | - $trs = $this->getElementsByTagName('tr'); |
|
| 348 | - foreach ($trs as $tr) { |
|
| 349 | - /** @var \DOMElement $tr */ |
|
| 350 | - $rowspan = $tr->getAttribute('rowspan'); |
|
| 351 | - $rows += ($rowspan || 1); |
|
| 352 | - |
|
| 353 | - // Now look for column-related info |
|
| 354 | - $columnsInThisRow = 0; |
|
| 355 | - $cells = $tr->getElementsByTagName('td'); |
|
| 356 | - foreach ($cells as $cell) { |
|
| 357 | - /** @var \DOMElement $cell */ |
|
| 358 | - $colspan = $cell->getAttribute('colspan'); |
|
| 359 | - $columnsInThisRow += ($colspan || 1); |
|
| 360 | - } |
|
| 361 | - $columns = max($columns, $columnsInThisRow); |
|
| 362 | - } |
|
| 363 | - |
|
| 364 | - return ['rows' => $rows, 'columns' => $columns]; |
|
| 365 | - } |
|
| 366 | - |
|
| 367 | - /** |
|
| 368 | - * Creates a new node based on the text content of the original node. |
|
| 369 | - * |
|
| 370 | - * @param $originalNode DOMNode |
|
| 371 | - * @param $tagName string |
|
| 372 | - * |
|
| 373 | - * @return DOMElement |
|
| 374 | - */ |
|
| 375 | - public function createNode($originalNode, $tagName) |
|
| 376 | - { |
|
| 377 | - $text = $originalNode->getTextContent(); |
|
| 378 | - $newNode = $originalNode->ownerDocument->createElement($tagName, $text); |
|
| 379 | - |
|
| 380 | - return $newNode; |
|
| 381 | - } |
|
| 382 | - |
|
| 383 | - /** |
|
| 384 | - * Check if a given node has one of its ancestor tag name matching the |
|
| 385 | - * provided one. |
|
| 386 | - * |
|
| 387 | - * @param string $tagName |
|
| 388 | - * @param int $maxDepth |
|
| 389 | - * @param callable $filterFn |
|
| 390 | - * |
|
| 391 | - * @return bool |
|
| 392 | - */ |
|
| 393 | - public function hasAncestorTag($tagName, $maxDepth = 3, callable $filterFn = null) |
|
| 394 | - { |
|
| 395 | - $depth = 0; |
|
| 396 | - $node = $this; |
|
| 397 | - |
|
| 398 | - while ($node->parentNode) { |
|
| 399 | - if ($maxDepth > 0 && $depth > $maxDepth) { |
|
| 400 | - return false; |
|
| 401 | - } |
|
| 402 | - |
|
| 403 | - if ($node->parentNode->nodeName === $tagName && (!$filterFn || $filterFn($node->parentNode))) { |
|
| 404 | - return true; |
|
| 405 | - } |
|
| 406 | - |
|
| 407 | - $node = $node->parentNode; |
|
| 408 | - $depth++; |
|
| 409 | - } |
|
| 410 | - |
|
| 411 | - return false; |
|
| 412 | - } |
|
| 413 | - |
|
| 414 | - /** |
|
| 415 | - * Check if this node has only whitespace and a single element with given tag |
|
| 416 | - * or if it contains no element with given tag or more than 1 element. |
|
| 417 | - * |
|
| 418 | - * @param $tag string Name of tag |
|
| 419 | - * |
|
| 420 | - * @return bool |
|
| 421 | - */ |
|
| 422 | - public function hasSingleTagInsideElement($tag) |
|
| 423 | - { |
|
| 424 | - // There should be exactly 1 element child with given tag |
|
| 425 | - if (count($children = NodeUtility::filterTextNodes($this->childNodes)) !== 1 || $children->item(0)->nodeName !== $tag) { |
|
| 426 | - return false; |
|
| 427 | - } |
|
| 428 | - |
|
| 429 | - // And there should be no text nodes with real content |
|
| 430 | - return array_reduce(iterator_to_array($children), function ($carry, $child) { |
|
| 431 | - if (!$carry === false) { |
|
| 432 | - return false; |
|
| 433 | - } |
|
| 434 | - |
|
| 435 | - /* @var DOMNode $child */ |
|
| 436 | - return !($child->nodeType === XML_TEXT_NODE && !preg_match('/\S$/', $child->getTextContent())); |
|
| 437 | - }); |
|
| 438 | - } |
|
| 439 | - |
|
| 440 | - /** |
|
| 441 | - * Check if the current element has a single child block element. |
|
| 442 | - * Block elements are the ones defined in the divToPElements array. |
|
| 443 | - * |
|
| 444 | - * @return bool |
|
| 445 | - */ |
|
| 446 | - public function hasSingleChildBlockElement() |
|
| 447 | - { |
|
| 448 | - $result = false; |
|
| 449 | - if ($this->hasChildNodes()) { |
|
| 450 | - foreach ($this->childNodes as $child) { |
|
| 451 | - if (in_array($child->nodeName, $this->divToPElements)) { |
|
| 452 | - $result = true; |
|
| 453 | - } else { |
|
| 454 | - // If any of the hasSingleChildBlockElement calls return true, return true then. |
|
| 455 | - /** @var $child DOMElement */ |
|
| 456 | - $result = ($result || $child->hasSingleChildBlockElement()); |
|
| 457 | - } |
|
| 458 | - } |
|
| 459 | - } |
|
| 460 | - |
|
| 461 | - return $result; |
|
| 462 | - } |
|
| 463 | - |
|
| 464 | - /** |
|
| 465 | - * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace. |
|
| 466 | - * |
|
| 467 | - * @return bool |
|
| 468 | - */ |
|
| 469 | - public function isElementWithoutContent() |
|
| 470 | - { |
|
| 471 | - return $this instanceof DOMElement && |
|
| 472 | - mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent)) === 0 && |
|
| 473 | - ($this->childNodes->length === 0 || |
|
| 474 | - $this->childNodes->length === $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length |
|
| 475 | - /* |
|
| 107 | + } |
|
| 108 | + |
|
/**
 * Computes the starting content score for this node, once.
 *
 * The base score comes from the tag name (see the table below); when
 * $weightClasses is truthy the result of getClassWeight() is added on top.
 * Nodes already reported as initialized by isInitialized() are left untouched.
 *
 * @param $weightClasses bool Add the class/id weight to the base score?
 *
 * @return static The same node, to allow chaining
 */
public function initializeNode($weightClasses)
{
    if ($this->isInitialized()) {
        return $this;
    }

    // Base score per tag name; tags not listed here start at 0.
    $tagScores = [
        'div'        => 5,

        'pre'        => 3,
        'td'         => 3,
        'blockquote' => 3,

        'address'    => -3,
        'ol'         => -3,
        'ul'         => -3,
        'dl'         => -3,
        'dd'         => -3,
        'dt'         => -3,
        'li'         => -3,
        'form'       => -3,

        'h1'         => -5,
        'h2'         => -5,
        'h3'         => -5,
        'h4'         => -5,
        'h5'         => -5,
        'h6'         => -5,
        'th'         => -5,
    ];

    $baseScore = isset($tagScores[$this->nodeName]) ? $tagScores[$this->nodeName] : 0;
    $classBonus = $weightClasses ? $this->getClassWeight() : 0;

    $this->contentScore = $baseScore + $classBonus;
    $this->initialized = true;

    return $this;
}
|
| 163 | + |
|
/**
 * Null-safe wrapper around the native getAttribute().
 *
 * When this node's attributes property is null the native call is skipped
 * and an empty string is returned instead.
 *
 * @param $attributeName string Attribute to retrieve
 *
 * @return string
 */
public function getAttribute($attributeName)
{
    if (is_null($this->attributes)) {
        return '';
    }

    return parent::getAttribute($attributeName);
}
|
| 180 | + |
|
/**
 * Null-safe wrapper around the native hasAttribute().
 *
 * When this node's attributes property is null the native call is skipped
 * and false is returned.
 *
 * @param $attributeName string Attribute to look for
 *
 * @return bool
 *
 * @see getAttribute
 */
public function hasAttribute($attributeName)
{
    if (is_null($this->attributes)) {
        return false;
    }

    return parent::hasAttribute($attributeName);
}
|
| 198 | + |
|
/**
 * Collects the ancestors of this node, nearest first.
 *
 * The walk stops at the first missing parent, at a DOMDocument, or as soon
 * as $maxLevel ancestors have been collected. The limit is compared with
 * ===, so a non-integer value such as false never matches and every
 * ancestor is returned.
 *
 * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them
 *
 * @return array
 */
public function getNodeAncestors($maxLevel = 3)
{
    $chain = [];

    for (
        $current = $this->parentNode;
        $current && !($current instanceof DOMDocument);
        $current = $current->parentNode
    ) {
        $chain[] = $current;

        // Strict comparison on purpose: only an int limit can end the walk early.
        if (count($chain) === $maxLevel) {
            break;
        }
    }

    return $chain;
}
|
| 224 | + |
|
/**
 * Returns every <a> element found under the current element, as a plain array.
 *
 * @return array
 */
public function getAllLinks()
{
    $anchors = $this->getElementsByTagName('a');

    return iterator_to_array($anchors);
}
|
| 234 | + |
|
/**
 * Get the link density of this node: the number of characters of
 * (whitespace-normalized) text that sit inside links, divided by the number
 * of characters of normalized text in the whole node.
 *
 * The result is a ratio, not a value scaled to 100.
 *
 * @return int|float 0 when the node has no text, otherwise the ratio
 */
public function getLinkDensity()
{
    $totalChars = mb_strlen($this->getTextContent(true));

    // No text at all: bail out before dividing by zero.
    if (!$totalChars) {
        return 0;
    }

    $linkedChars = 0;

    /** @var DOMElement $anchor */
    foreach ($this->getAllLinks() as $anchor) {
        $linkedChars += mb_strlen($anchor->getTextContent(true));
    }

    return $linkedChars / $totalChars;
}
|
| 261 | + |
|
/**
 * Scores the element from its class and id attributes.
 *
 * Each of the two attributes, when non-empty after trimming, loses 25 points
 * if it matches the 'negative' pattern and gains 25 points if it matches the
 * 'positive' pattern (both taken from NodeUtility::$regexps).
 *
 * @return int
 */
public function getClassWeight()
{
    $weight = 0;

    // Same rules for both attributes, class first and then id.
    foreach (['class', 'id'] as $attributeName) {
        $value = $this->getAttribute($attributeName);

        if (!trim($value)) {
            continue;
        }

        if (preg_match(NodeUtility::$regexps['negative'], $value)) {
            $weight -= 25;
        }

        if (preg_match(NodeUtility::$regexps['positive'], $value)) {
            $weight += 25;
        }
    }

    return $weight;
}
|
| 297 | + |
|
/**
 * Returns the node's nodeValue, optionally whitespace-normalized.
 *
 * Normalizing replaces every run of two or more whitespace characters with a
 * single space and trims both ends.
 *
 * @param bool $normalize Normalize white space?
 *
 * @return string
 */
public function getTextContent($normalize = false)
{
    $text = $this->nodeValue;

    return $normalize
        ? trim(preg_replace('/\s{2,}/', ' ', $text))
        : $text;
}
|
| 314 | + |
|
/**
 * Returns the child nodes of the current node as an array.
 *
 * Always raises a (silenced) E_USER_DEPRECATED notice first.
 *
 * @param bool $filterEmptyDOMText Drop '#text' children whose trimmed value is empty?
 *
 * @deprecated Use NodeUtility::filterTextNodes, function will be removed in version 3.0
 *
 * @return array
 */
public function getChildren($filterEmptyDOMText = false)
{
    @trigger_error('getChildren was replaced with NodeUtility::filterTextNodes and will be removed in version 3.0', E_USER_DEPRECATED);

    $children = iterator_to_array($this->childNodes);

    if (!$filterEmptyDOMText) {
        return $children;
    }

    $kept = array_filter($children, function ($node) {
        if ($node->nodeName !== '#text') {
            return true;
        }

        // Text nodes survive only when something is left after trimming.
        return mb_strlen(trim($node->nodeValue));
    });

    // array_filter keeps the original keys; reindex so they run 0..n-1 without gaps.
    return array_values($kept);
}
|
| 338 | + |
|
/**
 * Return an array indicating how many rows and columns this table has.
 *
 * Every <tr> below this node contributes its rowspan to the row total, and
 * every <td> contributes its colspan to its row's column total. A missing,
 * non-numeric or non-positive span counts as 1. The column figure is the
 * widest row found.
 *
 * @return array ['rows' => int, 'columns' => int]
 */
public function getRowAndColumnCount()
{
    $rows = $columns = 0;

    foreach ($this->getElementsByTagName('tr') as $tr) {
        /** @var \DOMElement $tr */
        // In PHP `$x || 1` yields a boolean, so the span has to be parsed
        // explicitly for its value to be counted.
        $rowspan = (int) $tr->getAttribute('rowspan');
        $rows += $rowspan > 0 ? $rowspan : 1;

        // Now look for column-related info
        $columnsInThisRow = 0;
        foreach ($tr->getElementsByTagName('td') as $cell) {
            /** @var \DOMElement $cell */
            $colspan = (int) $cell->getAttribute('colspan');
            $columnsInThisRow += $colspan > 0 ? $colspan : 1;
        }

        $columns = max($columns, $columnsInThisRow);
    }

    return ['rows' => $rows, 'columns' => $columns];
}
|
| 366 | + |
|
/**
 * Builds a new element named $tagName whose value is the text content of
 * $originalNode. The element is created through the original node's owner
 * document; it is returned without being attached anywhere.
 *
 * @param $originalNode DOMNode
 * @param $tagName string
 *
 * @return DOMElement
 */
public function createNode($originalNode, $tagName)
{
    return $originalNode->ownerDocument->createElement(
        $tagName,
        $originalNode->getTextContent()
    );
}
|
| 382 | + |
|
/**
 * Tells whether one of this node's ancestors is named $tagName and, when a
 * filter is supplied, is also accepted by it.
 *
 * Ancestors are visited from the nearest one upwards. With a positive
 * $maxDepth the search gives up once the number of ancestors already
 * examined exceeds $maxDepth; a value of 0 or less disables the limit.
 *
 * @param string   $tagName
 * @param int      $maxDepth
 * @param callable $filterFn Receives the candidate ancestor; a truthy return accepts it
 *
 * @return bool
 */
public function hasAncestorTag($tagName, $maxDepth = 3, callable $filterFn = null)
{
    for (
        $ancestor = $this->parentNode, $climbed = 0;
        $ancestor;
        $ancestor = $ancestor->parentNode, $climbed++
    ) {
        if ($maxDepth > 0 && $climbed > $maxDepth) {
            return false;
        }

        $nameMatches = $ancestor->nodeName === $tagName;

        if ($nameMatches && (!$filterFn || $filterFn($ancestor))) {
            return true;
        }
    }

    return false;
}
|
| 413 | + |
|
/**
 * Checks whether, once NodeUtility::filterTextNodes() has been applied to
 * the children, this node is left with exactly one child and that child is
 * named $tag.
 *
 * When that holds, the remaining children are folded into a verdict that
 * rejects any text node whose content does not end in a non-whitespace
 * character.
 *
 * @param $tag string Name of tag
 *
 * @return bool
 */
public function hasSingleTagInsideElement($tag)
{
    $children = NodeUtility::filterTextNodes($this->childNodes);

    // There should be exactly 1 element child with given tag
    if (count($children) !== 1 || $children->item(0)->nodeName !== $tag) {
        return false;
    }

    // And there should be no text nodes with real content.
    // The verdict starts out as null, like a reduce without an initial value.
    $verdict = null;

    foreach (iterator_to_array($children) as $child) {
        // A truthy verdict from the previous child flips the result to false.
        if ($verdict) {
            $verdict = false;
            continue;
        }

        /* @var DOMNode $child */
        $isBlankText = $child->nodeType === XML_TEXT_NODE
            && !preg_match('/\S$/', $child->getTextContent());

        $verdict = !$isBlankText;
    }

    return $verdict;
}
|
| 439 | + |
|
/**
 * Reports whether a block element exists among this node's children or,
 * recursively, among their descendants.
 * Block elements are the tag names listed in the divToPElements array.
 *
 * @return bool
 */
public function hasSingleChildBlockElement()
{
    if (!$this->hasChildNodes()) {
        return false;
    }

    $found = false;

    foreach ($this->childNodes as $child) {
        if (in_array($child->nodeName, $this->divToPElements)) {
            $found = true;
            continue;
        }

        // Not a block itself: look inside it, unless a block was already seen.
        if (!$found) {
            /** @var $child DOMElement */
            $found = (bool) $child->hasSingleChildBlockElement();
        }
    }

    return $found;
}
|
| 463 | + |
|
| 464 | + /** |
|
| 465 | + * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace. |
|
| 466 | + * |
|
| 467 | + * @return bool |
|
| 468 | + */ |
|
| 469 | + public function isElementWithoutContent() |
|
| 470 | + { |
|
| 471 | + return $this instanceof DOMElement && |
|
| 472 | + mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent)) === 0 && |
|
| 473 | + ($this->childNodes->length === 0 || |
|
| 474 | + $this->childNodes->length === $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length |
|
| 475 | + /* |
|
| 476 | 476 | * Special PHP DOMDocument case: We also need to count how many DOMText we have inside the node. |
| 477 | 477 | * If there's an empty tag with an space inside and a BR (for example "<p> <br/></p>) counting only BRs and |
| 478 | 478 | * HRs will will say that the example has 2 nodes, instead of one. This happens because in DOMDocument, |
@@ -480,108 +480,108 @@ discard block |
||
| 480 | 480 | * are dealing with (And at this point we know they are empty or are just whitespace, because of the |
| 481 | 481 | * mb_strlen in this chain of checks). |
| 482 | 482 | */ |
| 483 | - + count(array_filter(iterator_to_array($this->childNodes), function ($child) { |
|
| 484 | - return $child instanceof DOMText; |
|
| 485 | - })) |
|
| 486 | - |
|
| 487 | - ); |
|
| 488 | - } |
|
| 489 | - |
|
| 490 | - /** |
|
| 491 | - * Determine if a node qualifies as phrasing content. |
|
| 492 | - * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content. |
|
| 493 | - * |
|
| 494 | - * @return bool |
|
| 495 | - */ |
|
| 496 | - public function isPhrasingContent() |
|
| 497 | - { |
|
| 498 | - return $this->nodeType === XML_TEXT_NODE || in_array($this->nodeName, $this->phrasing_elems) !== false || |
|
| 499 | - (!is_null($this->childNodes) && |
|
| 500 | - ($this->nodeName === 'a' || $this->nodeName === 'del' || $this->nodeName === 'ins') && |
|
| 501 | - array_reduce(iterator_to_array($this->childNodes), function ($carry, $node) { |
|
| 502 | - return $node->isPhrasingContent() && $carry; |
|
| 503 | - }, true) |
|
| 504 | - ); |
|
| 505 | - } |
|
| 506 | - |
|
| 507 | - /** |
|
| 508 | - * In the original JS project they check if the node has the style display=none, which unfortunately |
|
| 509 | - * in our case we have no way of knowing that. So we just check for the attribute hidden or "display: none". |
|
| 510 | - * |
|
| 511 | - * Might be a good idea to check for classes or other attributes like 'aria-hidden' |
|
| 512 | - * |
|
| 513 | - * @return bool |
|
| 514 | - */ |
|
| 515 | - public function isProbablyVisible() |
|
| 516 | - { |
|
| 517 | - return !preg_match('/display:( )?none/', $this->getAttribute('style')) && !$this->hasAttribute('hidden'); |
|
| 518 | - } |
|
| 519 | - |
|
| 520 | - /** |
|
| 521 | - * @return bool |
|
| 522 | - */ |
|
| 523 | - public function isWhitespace() |
|
| 524 | - { |
|
| 525 | - return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) || |
|
| 526 | - ($this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br'); |
|
| 527 | - } |
|
| 528 | - |
|
| 529 | - /** |
|
| 530 | - * This is a hack that overcomes the issue of node shifting when scanning and removing nodes. |
|
| 531 | - * |
|
| 532 | - * In the JS version of getElementsByTagName, if you remove a node it will not appear during the |
|
| 533 | - * foreach. This does not happen in PHP DOMDocument, because if you remove a node, it will still appear but as an |
|
| 534 | - * orphan node and will give an exception if you try to do anything with it. |
|
| 535 | - * |
|
| 536 | - * Shifting also occurs when converting parent nodes (like a P to a DIV), which in that case the found nodes are |
|
| 537 | - * removed from the foreach "pool" but the internal index of the foreach is not aware and skips over nodes that |
|
| 538 | - * never looped over. (index is at position 5, 2 nodes are removed, next one should be node 3, but the foreach tries |
|
| 539 | - * to access node 6) |
|
| 540 | - * |
|
| 541 | - * This function solves this by searching for the nodes on every loop and keeping track of the count differences. |
|
| 542 | - * Because on every loop we call getElementsByTagName again, this could cause a performance impact and should be |
|
| 543 | - * used only when the results of the search are going to be used to remove the nodes. |
|
| 544 | - * |
|
| 545 | - * @param string $tag |
|
| 546 | - * |
|
| 547 | - * @return \Generator |
|
| 548 | - */ |
|
| 549 | - public function shiftingAwareGetElementsByTagName($tag) |
|
| 550 | - { |
|
| 551 | - /** @var $nodes DOMNodeList */ |
|
| 552 | - $nodes = $this->getElementsByTagName($tag); |
|
| 553 | - $count = $nodes->length; |
|
| 554 | - |
|
| 555 | - for ($i = 0; $i < $count; $i = max(++$i, 0)) { |
|
| 556 | - yield $nodes->item($i); |
|
| 557 | - |
|
| 558 | - // Search for all the nodes again |
|
| 559 | - $nodes = $this->getElementsByTagName($tag); |
|
| 560 | - |
|
| 561 | - // Subtract the amount of nodes removed from the current index |
|
| 562 | - $i -= $count - $nodes->length; |
|
| 563 | - |
|
| 564 | - // Subtract the amount of nodes removed from the current count |
|
| 565 | - $count -= ($count - $nodes->length); |
|
| 566 | - } |
|
| 567 | - } |
|
| 568 | - |
|
| 569 | - /** |
|
| 570 | - * Mimics JS's firstElementChild property. PHP only has firstChild which could be any type of DOMNode. Use this |
|
| 571 | - * function to get the first one that is an DOMElement node. |
|
| 572 | - * |
|
| 573 | - * @return \DOMElement|null |
|
| 574 | - */ |
|
| 575 | - public function getFirstElementChild() |
|
| 576 | - { |
|
| 577 | - if ($this->childNodes instanceof \Traversable) { |
|
| 578 | - foreach ($this->childNodes as $node) { |
|
| 579 | - if ($node instanceof \DOMElement) { |
|
| 580 | - return $node; |
|
| 581 | - } |
|
| 582 | - } |
|
| 583 | - } |
|
| 584 | - |
|
| 585 | - return null; |
|
| 586 | - } |
|
| 483 | + + count(array_filter(iterator_to_array($this->childNodes), function ($child) { |
|
| 484 | + return $child instanceof DOMText; |
|
| 485 | + })) |
|
| 486 | + |
|
| 487 | + ); |
|
| 488 | + } |
|
| 489 | + |
|
/**
 * Determine if a node qualifies as phrasing content.
 * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content.
 *
 * A node qualifies when it is a text node, when its name is listed in
 * phrasing_elems, or when it is an <a>, <del> or <ins> whose children all
 * qualify themselves.
 *
 * @return bool
 */
public function isPhrasingContent()
{
    if ($this->nodeType === XML_TEXT_NODE) {
        return true;
    }

    if (in_array($this->nodeName, $this->phrasing_elems) !== false) {
        return true;
    }

    if (is_null($this->childNodes)) {
        return false;
    }

    // Only these three wrappers inherit the status from their children.
    if (!in_array($this->nodeName, ['a', 'del', 'ins'], true)) {
        return false;
    }

    $allPhrasing = true;
    foreach (iterator_to_array($this->childNodes) as $child) {
        $allPhrasing = $child->isPhrasingContent() && $allPhrasing;
    }

    return $allPhrasing;
}
|
| 506 | + |
|
/**
 * Heuristic visibility check based on the node's own attributes only.
 *
 * The node is considered hidden when its style attribute contains
 * "display:none" (with at most one space after the colon) or when it carries
 * a hidden attribute. Stylesheets, classes and attributes such as
 * 'aria-hidden' are not looked at.
 *
 * @return bool
 */
public function isProbablyVisible()
{
    if (preg_match('/display:( )?none/', $this->getAttribute('style'))) {
        return false;
    }

    return !$this->hasAttribute('hidden');
}
|
| 519 | + |
|
/**
 * True for a text node whose trimmed content is empty, and for a <br> element.
 *
 * @return bool
 */
public function isWhitespace()
{
    if ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) {
        return true;
    }

    return $this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br';
}
|
| 528 | + |
|
/**
 * Iterates over the descendants named $tag in a way that tolerates the
 * consumer removing or converting nodes between two iterations.
 *
 * After every yielded node the search is run again. If the new result is
 * shorter than the previous one, the difference is subtracted from both the
 * current position and the expected total before advancing, so that nodes
 * which moved down in the list are not skipped. The position is never
 * allowed to drop below zero.
 *
 * Since getElementsByTagName is called again on every step, this is more
 * expensive than a plain foreach and is meant for loops that actually remove
 * or replace the nodes they receive.
 *
 * @param string $tag
 *
 * @return \Generator
 */
public function shiftingAwareGetElementsByTagName($tag)
{
    /** @var $matches DOMNodeList */
    $matches = $this->getElementsByTagName($tag);
    $total = $matches->length;
    $index = 0;

    while ($index < $total) {
        yield $matches->item($index);

        // Search for all the nodes again
        $matches = $this->getElementsByTagName($tag);

        // How many matches vanished while the consumer held the node
        $removed = $total - $matches->length;

        // Shift both the cursor and the total back by that amount
        $index -= $removed;
        $total -= $removed;

        // Advance, clamping at the start of the list
        $index = max($index + 1, 0);
    }
}
|
| 568 | + |
|
/**
 * Mimics JS's firstElementChild property: returns the first child that is a
 * \DOMElement, skipping children of any other node type.
 *
 * @return \DOMElement|null Null when childNodes is not traversable or holds no element
 */
public function getFirstElementChild()
{
    if (!($this->childNodes instanceof \Traversable)) {
        return null;
    }

    foreach ($this->childNodes as $candidate) {
        if ($candidate instanceof \DOMElement) {
            return $candidate;
        }
    }

    return null;
}
|
| 587 | 587 | } |