@@ -6,5 +6,5 @@ |
||
6 | 6 | |
7 | 7 | class DOMEntityReference extends \DOMEntityReference |
8 | 8 | { |
9 | - use NodeTrait; |
|
9 | + use NodeTrait; |
|
10 | 10 | } |
@@ -6,5 +6,5 @@ |
||
6 | 6 | |
7 | 7 | class DOMNotation extends \DOMNotation |
8 | 8 | { |
9 | - use NodeTrait; |
|
9 | + use NodeTrait; |
|
10 | 10 | } |
@@ -6,5 +6,5 @@ |
||
6 | 6 | |
7 | 7 | class DOMComment extends \DOMComment |
8 | 8 | { |
9 | - use NodeTrait; |
|
9 | + use NodeTrait; |
|
10 | 10 | } |
@@ -6,5 +6,5 @@ |
||
6 | 6 | |
7 | 7 | class DOMDocumentFragment extends \DOMDocumentFragment |
8 | 8 | { |
9 | - use NodeTrait; |
|
9 | + use NodeTrait; |
|
10 | 10 | } |
@@ -14,69 +14,69 @@ |
||
14 | 14 | */ |
15 | 15 | class DOMNodeList implements \Countable, \IteratorAggregate |
16 | 16 | { |
17 | - /** |
|
18 | - * @var array |
|
19 | - */ |
|
20 | - protected $items = []; |
|
17 | + /** |
|
18 | + * @var array |
|
19 | + */ |
|
20 | + protected $items = []; |
|
21 | 21 | |
22 | - /** |
|
23 | - * @var int |
|
24 | - */ |
|
25 | - protected $length = 0; |
|
22 | + /** |
|
23 | + * @var int |
|
24 | + */ |
|
25 | + protected $length = 0; |
|
26 | 26 | |
27 | - /** |
|
28 | - * To allow access to length in the same way that DOMNodeList allows. |
|
29 | - * |
|
30 | - * {@inheritdoc} |
|
31 | - */ |
|
32 | - public function __get($name) |
|
33 | - { |
|
34 | - switch ($name) { |
|
35 | - case 'length': |
|
36 | - return $this->length; |
|
37 | - default: |
|
38 | - trigger_error(sprintf('Undefined property: %s::%s', static::class, $name)); |
|
39 | - } |
|
40 | - } |
|
27 | + /** |
|
28 | + * To allow access to length in the same way that DOMNodeList allows. |
|
29 | + * |
|
30 | + * {@inheritdoc} |
|
31 | + */ |
|
32 | + public function __get($name) |
|
33 | + { |
|
34 | + switch ($name) { |
|
35 | + case 'length': |
|
36 | + return $this->length; |
|
37 | + default: |
|
38 | + trigger_error(sprintf('Undefined property: %s::%s', static::class, $name)); |
|
39 | + } |
|
40 | + } |
|
41 | 41 | |
42 | - /** |
|
43 | - * @param DOMNode|DOMElement|DOMComment $node |
|
44 | - * |
|
45 | - * @return DOMNodeList |
|
46 | - */ |
|
47 | - public function add($node) |
|
48 | - { |
|
49 | - $this->items[] = $node; |
|
50 | - $this->length++; |
|
42 | + /** |
|
43 | + * @param DOMNode|DOMElement|DOMComment $node |
|
44 | + * |
|
45 | + * @return DOMNodeList |
|
46 | + */ |
|
47 | + public function add($node) |
|
48 | + { |
|
49 | + $this->items[] = $node; |
|
50 | + $this->length++; |
|
51 | 51 | |
52 | - return $this; |
|
53 | - } |
|
52 | + return $this; |
|
53 | + } |
|
54 | 54 | |
55 | - /** |
|
56 | - * @param int $offset |
|
57 | - * |
|
58 | - * @return DOMNode|DOMElement|DOMComment |
|
59 | - */ |
|
60 | - public function item(int $offset) |
|
61 | - { |
|
62 | - return $this->items[$offset]; |
|
63 | - } |
|
55 | + /** |
|
56 | + * @param int $offset |
|
57 | + * |
|
58 | + * @return DOMNode|DOMElement|DOMComment |
|
59 | + */ |
|
60 | + public function item(int $offset) |
|
61 | + { |
|
62 | + return $this->items[$offset]; |
|
63 | + } |
|
64 | 64 | |
65 | - /** |
|
66 | - * @return int|void |
|
67 | - */ |
|
68 | - public function count(): int |
|
69 | - { |
|
70 | - return $this->length; |
|
71 | - } |
|
65 | + /** |
|
66 | + * @return int|void |
|
67 | + */ |
|
68 | + public function count(): int |
|
69 | + { |
|
70 | + return $this->length; |
|
71 | + } |
|
72 | 72 | |
73 | - /** |
|
74 | - * To make it compatible with iterator_to_array() function. |
|
75 | - * |
|
76 | - * {@inheritdoc} |
|
77 | - */ |
|
78 | - public function getIterator(): \ArrayIterator |
|
79 | - { |
|
80 | - return new \ArrayIterator($this->items); |
|
81 | - } |
|
73 | + /** |
|
74 | + * To make it compatible with iterator_to_array() function. |
|
75 | + * |
|
76 | + * {@inheritdoc} |
|
77 | + */ |
|
78 | + public function getIterator(): \ArrayIterator |
|
79 | + { |
|
80 | + return new \ArrayIterator($this->items); |
|
81 | + } |
|
82 | 82 | } |
@@ -6,25 +6,25 @@ |
||
6 | 6 | |
7 | 7 | class DOMDocument extends \DOMDocument |
8 | 8 | { |
9 | - use NodeTrait; |
|
9 | + use NodeTrait; |
|
10 | 10 | |
11 | - public function __construct($version, $encoding) |
|
12 | - { |
|
13 | - parent::__construct($version, $encoding); |
|
11 | + public function __construct($version, $encoding) |
|
12 | + { |
|
13 | + parent::__construct($version, $encoding); |
|
14 | 14 | |
15 | - $this->registerNodeClass('DOMAttr', DOMAttr::class); |
|
16 | - $this->registerNodeClass('DOMCdataSection', DOMCdataSection::class); |
|
17 | - $this->registerNodeClass('DOMCharacterData', DOMCharacterData::class); |
|
18 | - $this->registerNodeClass('DOMComment', DOMComment::class); |
|
19 | - $this->registerNodeClass('DOMDocument', self::class); |
|
20 | - $this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class); |
|
21 | - $this->registerNodeClass('DOMDocumentType', DOMDocumentType::class); |
|
22 | - $this->registerNodeClass('DOMElement', DOMElement::class); |
|
23 | - $this->registerNodeClass('DOMEntity', DOMEntity::class); |
|
24 | - $this->registerNodeClass('DOMEntityReference', DOMEntityReference::class); |
|
25 | - $this->registerNodeClass('DOMNode', DOMNode::class); |
|
26 | - $this->registerNodeClass('DOMNotation', DOMNotation::class); |
|
27 | - $this->registerNodeClass('DOMProcessingInstruction', DOMProcessingInstruction::class); |
|
28 | - $this->registerNodeClass('DOMText', DOMText::class); |
|
29 | - } |
|
15 | + $this->registerNodeClass('DOMAttr', DOMAttr::class); |
|
16 | + $this->registerNodeClass('DOMCdataSection', DOMCdataSection::class); |
|
17 | + $this->registerNodeClass('DOMCharacterData', DOMCharacterData::class); |
|
18 | + $this->registerNodeClass('DOMComment', DOMComment::class); |
|
19 | + $this->registerNodeClass('DOMDocument', self::class); |
|
20 | + $this->registerNodeClass('DOMDocumentFragment', DOMDocumentFragment::class); |
|
21 | + $this->registerNodeClass('DOMDocumentType', DOMDocumentType::class); |
|
22 | + $this->registerNodeClass('DOMElement', DOMElement::class); |
|
23 | + $this->registerNodeClass('DOMEntity', DOMEntity::class); |
|
24 | + $this->registerNodeClass('DOMEntityReference', DOMEntityReference::class); |
|
25 | + $this->registerNodeClass('DOMNode', DOMNode::class); |
|
26 | + $this->registerNodeClass('DOMNotation', DOMNotation::class); |
|
27 | + $this->registerNodeClass('DOMProcessingInstruction', DOMProcessingInstruction::class); |
|
28 | + $this->registerNodeClass('DOMText', DOMText::class); |
|
29 | + } |
|
30 | 30 | } |
@@ -6,5 +6,5 @@ |
||
6 | 6 | |
7 | 7 | class DOMEntity extends \DOMEntity |
8 | 8 | { |
9 | - use NodeTrait; |
|
9 | + use NodeTrait; |
|
10 | 10 | } |
@@ -12,126 +12,126 @@ discard block |
||
12 | 12 | */ |
13 | 13 | class NodeUtility |
14 | 14 | { |
15 | - /** |
|
16 | - * Collection of regexps to check the node usability. |
|
17 | - * |
|
18 | - * @var array |
|
19 | - */ |
|
20 | - public static $regexps = [ |
|
21 | - 'unlikelyCandidates' => '/-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', |
|
22 | - 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', |
|
23 | - 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', |
|
24 | - 'byline' => '/byline|author|dateline|writtenby|p-author/i', |
|
25 | - 'replaceFonts' => '/<(\/?)font[^>]*>/gi', |
|
26 | - 'normalize' => '/\s{2,}/', |
|
27 | - 'videos' => '/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i', |
|
28 | - 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i', |
|
29 | - 'prevLink' => '/(prev|earl|old|new|<|«)/i', |
|
30 | - 'whitespace' => '/^\s*$/', |
|
31 | - 'hasContent' => '/\S$/', |
|
32 | - 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i', |
|
33 | - 'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i', |
|
34 | - // \x{00A0} is the unicode version of |
|
35 | - 'onlyWhitespace' => '/\x{00A0}|\s+/u' |
|
36 | - ]; |
|
37 | - |
|
38 | - /** |
|
39 | - * Imported from the Element class on league\html-to-markdown. |
|
40 | - * |
|
41 | - * @param $node |
|
42 | - * |
|
43 | - * @return DOMElement |
|
44 | - */ |
|
45 | - public static function nextElement($node) |
|
46 | - { |
|
47 | - $next = $node; |
|
48 | - while ($next |
|
49 | - && $next->nodeType !== XML_ELEMENT_NODE |
|
50 | - && $next->isWhitespace()) { |
|
51 | - $next = $next->nextSibling; |
|
52 | - } |
|
53 | - |
|
54 | - return $next; |
|
55 | - } |
|
56 | - |
|
57 | - /** |
|
58 | - * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new |
|
59 | - * element with the new tag name and importing it to the main DOMDocument. |
|
60 | - * |
|
61 | - * @param DOMNode $node |
|
62 | - * @param string $value |
|
63 | - * @param bool $importAttributes |
|
64 | - * |
|
65 | - * @return DOMNode |
|
66 | - */ |
|
67 | - public static function setNodeTag($node, $value, $importAttributes = true) |
|
68 | - { |
|
69 | - $new = new DOMDocument('1.0', 'utf-8'); |
|
70 | - $new->appendChild($new->createElement($value)); |
|
71 | - |
|
72 | - $children = $node->childNodes; |
|
73 | - /** @var $children \DOMNodeList $i */ |
|
74 | - for ($i = 0; $i < $children->length; $i++) { |
|
75 | - $import = $new->importNode($children->item($i), true); |
|
76 | - $new->firstChild->appendChild($import); |
|
77 | - } |
|
78 | - |
|
79 | - if ($importAttributes) { |
|
80 | - // Import attributes from the original node. |
|
81 | - foreach ($node->attributes as $attribute) { |
|
82 | - $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue); |
|
83 | - } |
|
84 | - } |
|
85 | - |
|
86 | - // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement. |
|
87 | - $import = $node->ownerDocument->importNode($new->firstChild, true); |
|
88 | - $node->parentNode->replaceChild($import, $node); |
|
89 | - |
|
90 | - return $import; |
|
91 | - } |
|
92 | - |
|
93 | - /** |
|
94 | - * Removes the current node and returns the next node to be parsed (child, sibling or parent). |
|
95 | - * |
|
96 | - * @param DOMNode $node |
|
97 | - * |
|
98 | - * @return DOMNode |
|
99 | - */ |
|
100 | - public static function removeAndGetNext($node) |
|
101 | - { |
|
102 | - $nextNode = self::getNextNode($node, true); |
|
103 | - $node->parentNode->removeChild($node); |
|
104 | - |
|
105 | - return $nextNode; |
|
106 | - } |
|
107 | - |
|
108 | - /** |
|
109 | - * Remove the selected node. |
|
110 | - * |
|
111 | - * @param $node DOMElement |
|
112 | - * |
|
113 | - * @return void |
|
114 | - **/ |
|
115 | - public static function removeNode($node) |
|
116 | - { |
|
117 | - $parent = $node->parentNode; |
|
118 | - if ($parent) { |
|
119 | - $parent->removeChild($node); |
|
120 | - } |
|
121 | - } |
|
122 | - |
|
123 | - /** |
|
124 | - * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally |
|
125 | - * for parents. |
|
126 | - * |
|
127 | - * @param DOMNode $originalNode |
|
128 | - * @param bool $ignoreSelfAndKids |
|
129 | - * |
|
130 | - * @return DOMNode |
|
131 | - */ |
|
132 | - public static function getNextNode($originalNode, $ignoreSelfAndKids = false) |
|
133 | - { |
|
134 | - /* |
|
15 | + /** |
|
16 | + * Collection of regexps to check the node usability. |
|
17 | + * |
|
18 | + * @var array |
|
19 | + */ |
|
20 | + public static $regexps = [ |
|
21 | + 'unlikelyCandidates' => '/-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i', |
|
22 | + 'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i', |
|
23 | + 'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i', |
|
24 | + 'byline' => '/byline|author|dateline|writtenby|p-author/i', |
|
25 | + 'replaceFonts' => '/<(\/?)font[^>]*>/gi', |
|
26 | + 'normalize' => '/\s{2,}/', |
|
27 | + 'videos' => '/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i', |
|
28 | + 'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i', |
|
29 | + 'prevLink' => '/(prev|earl|old|new|<|«)/i', |
|
30 | + 'whitespace' => '/^\s*$/', |
|
31 | + 'hasContent' => '/\S$/', |
|
32 | + 'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i', |
|
33 | + 'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i', |
|
34 | + // \x{00A0} is the unicode version of |
|
35 | + 'onlyWhitespace' => '/\x{00A0}|\s+/u' |
|
36 | + ]; |
|
37 | + |
|
38 | + /** |
|
39 | + * Imported from the Element class on league\html-to-markdown. |
|
40 | + * |
|
41 | + * @param $node |
|
42 | + * |
|
43 | + * @return DOMElement |
|
44 | + */ |
|
45 | + public static function nextElement($node) |
|
46 | + { |
|
47 | + $next = $node; |
|
48 | + while ($next |
|
49 | + && $next->nodeType !== XML_ELEMENT_NODE |
|
50 | + && $next->isWhitespace()) { |
|
51 | + $next = $next->nextSibling; |
|
52 | + } |
|
53 | + |
|
54 | + return $next; |
|
55 | + } |
|
56 | + |
|
57 | + /** |
|
58 | + * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new |
|
59 | + * element with the new tag name and importing it to the main DOMDocument. |
|
60 | + * |
|
61 | + * @param DOMNode $node |
|
62 | + * @param string $value |
|
63 | + * @param bool $importAttributes |
|
64 | + * |
|
65 | + * @return DOMNode |
|
66 | + */ |
|
67 | + public static function setNodeTag($node, $value, $importAttributes = true) |
|
68 | + { |
|
69 | + $new = new DOMDocument('1.0', 'utf-8'); |
|
70 | + $new->appendChild($new->createElement($value)); |
|
71 | + |
|
72 | + $children = $node->childNodes; |
|
73 | + /** @var $children \DOMNodeList $i */ |
|
74 | + for ($i = 0; $i < $children->length; $i++) { |
|
75 | + $import = $new->importNode($children->item($i), true); |
|
76 | + $new->firstChild->appendChild($import); |
|
77 | + } |
|
78 | + |
|
79 | + if ($importAttributes) { |
|
80 | + // Import attributes from the original node. |
|
81 | + foreach ($node->attributes as $attribute) { |
|
82 | + $new->firstChild->setAttribute($attribute->nodeName, $attribute->nodeValue); |
|
83 | + } |
|
84 | + } |
|
85 | + |
|
86 | + // The import must be done on the firstChild of $new, since $new is a DOMDocument and not a DOMElement. |
|
87 | + $import = $node->ownerDocument->importNode($new->firstChild, true); |
|
88 | + $node->parentNode->replaceChild($import, $node); |
|
89 | + |
|
90 | + return $import; |
|
91 | + } |
|
92 | + |
|
93 | + /** |
|
94 | + * Removes the current node and returns the next node to be parsed (child, sibling or parent). |
|
95 | + * |
|
96 | + * @param DOMNode $node |
|
97 | + * |
|
98 | + * @return DOMNode |
|
99 | + */ |
|
100 | + public static function removeAndGetNext($node) |
|
101 | + { |
|
102 | + $nextNode = self::getNextNode($node, true); |
|
103 | + $node->parentNode->removeChild($node); |
|
104 | + |
|
105 | + return $nextNode; |
|
106 | + } |
|
107 | + |
|
108 | + /** |
|
109 | + * Remove the selected node. |
|
110 | + * |
|
111 | + * @param $node DOMElement |
|
112 | + * |
|
113 | + * @return void |
|
114 | + **/ |
|
115 | + public static function removeNode($node) |
|
116 | + { |
|
117 | + $parent = $node->parentNode; |
|
118 | + if ($parent) { |
|
119 | + $parent->removeChild($node); |
|
120 | + } |
|
121 | + } |
|
122 | + |
|
123 | + /** |
|
124 | + * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally |
|
125 | + * for parents. |
|
126 | + * |
|
127 | + * @param DOMNode $originalNode |
|
128 | + * @param bool $ignoreSelfAndKids |
|
129 | + * |
|
130 | + * @return DOMNode |
|
131 | + */ |
|
132 | + public static function getNextNode($originalNode, $ignoreSelfAndKids = false) |
|
133 | + { |
|
134 | + /* |
|
135 | 135 | * Traverse the DOM from node to node, starting at the node passed in. |
136 | 136 | * Pass true for the second parameter to indicate this node itself |
137 | 137 | * (and its kids) are going away, and we want the next node over. |
@@ -139,42 +139,42 @@ discard block |
||
139 | 139 | * Calling this in a loop will traverse the DOM depth-first. |
140 | 140 | */ |
141 | 141 | |
142 | - // First check for kids if those aren't being ignored |
|
143 | - if (!$ignoreSelfAndKids && $originalNode->firstChild) { |
|
144 | - return $originalNode->firstChild; |
|
145 | - } |
|
146 | - |
|
147 | - // Then for siblings... |
|
148 | - if ($originalNode->nextSibling) { |
|
149 | - return $originalNode->nextSibling; |
|
150 | - } |
|
151 | - |
|
152 | - // And finally, move up the parent chain *and* find a sibling |
|
153 | - // (because this is depth-first traversal, we will have already |
|
154 | - // seen the parent nodes themselves). |
|
155 | - do { |
|
156 | - $originalNode = $originalNode->parentNode; |
|
157 | - } while ($originalNode && !$originalNode->nextSibling); |
|
158 | - |
|
159 | - return ($originalNode) ? $originalNode->nextSibling : $originalNode; |
|
160 | - } |
|
161 | - |
|
162 | - /** |
|
163 | - * Remove all empty DOMNodes from DOMNodeLists. |
|
164 | - * |
|
165 | - * @param \DOMNodeList $list |
|
166 | - * |
|
167 | - * @return DOMNodeList |
|
168 | - */ |
|
169 | - public static function filterTextNodes(\DOMNodeList $list) |
|
170 | - { |
|
171 | - $newList = new DOMNodeList(); |
|
172 | - foreach ($list as $node) { |
|
173 | - if ($node->nodeType !== XML_TEXT_NODE || mb_strlen(trim($node->nodeValue))) { |
|
174 | - $newList->add($node); |
|
175 | - } |
|
176 | - } |
|
177 | - |
|
178 | - return $newList; |
|
179 | - } |
|
142 | + // First check for kids if those aren't being ignored |
|
143 | + if (!$ignoreSelfAndKids && $originalNode->firstChild) { |
|
144 | + return $originalNode->firstChild; |
|
145 | + } |
|
146 | + |
|
147 | + // Then for siblings... |
|
148 | + if ($originalNode->nextSibling) { |
|
149 | + return $originalNode->nextSibling; |
|
150 | + } |
|
151 | + |
|
152 | + // And finally, move up the parent chain *and* find a sibling |
|
153 | + // (because this is depth-first traversal, we will have already |
|
154 | + // seen the parent nodes themselves). |
|
155 | + do { |
|
156 | + $originalNode = $originalNode->parentNode; |
|
157 | + } while ($originalNode && !$originalNode->nextSibling); |
|
158 | + |
|
159 | + return ($originalNode) ? $originalNode->nextSibling : $originalNode; |
|
160 | + } |
|
161 | + |
|
162 | + /** |
|
163 | + * Remove all empty DOMNodes from DOMNodeLists. |
|
164 | + * |
|
165 | + * @param \DOMNodeList $list |
|
166 | + * |
|
167 | + * @return DOMNodeList |
|
168 | + */ |
|
169 | + public static function filterTextNodes(\DOMNodeList $list) |
|
170 | + { |
|
171 | + $newList = new DOMNodeList(); |
|
172 | + foreach ($list as $node) { |
|
173 | + if ($node->nodeType !== XML_TEXT_NODE || mb_strlen(trim($node->nodeValue))) { |
|
174 | + $newList->add($node); |
|
175 | + } |
|
176 | + } |
|
177 | + |
|
178 | + return $newList; |
|
179 | + } |
|
180 | 180 | } |
@@ -13,75 +13,75 @@ discard block |
||
13 | 13 | */ |
14 | 14 | trait NodeTrait |
15 | 15 | { |
16 | - /** |
|
17 | - * Content score of the node. Used to determine the value of the content. |
|
18 | - * |
|
19 | - * @var int |
|
20 | - */ |
|
21 | - public $contentScore = 0; |
|
22 | - |
|
23 | - /** |
|
24 | - * Flag for initialized status. |
|
25 | - * |
|
26 | - * @var bool |
|
27 | - */ |
|
28 | - private $initialized = false; |
|
29 | - |
|
30 | - /** |
|
31 | - * Flag data tables. |
|
32 | - * |
|
33 | - * @var bool |
|
34 | - */ |
|
35 | - private $readabilityDataTable = false; |
|
36 | - |
|
37 | - /** |
|
38 | - * @var array |
|
39 | - */ |
|
40 | - private $divToPElements = [ |
|
41 | - 'a', |
|
42 | - 'blockquote', |
|
43 | - 'dl', |
|
44 | - 'div', |
|
45 | - 'img', |
|
46 | - 'ol', |
|
47 | - 'p', |
|
48 | - 'pre', |
|
49 | - 'table', |
|
50 | - 'ul', |
|
51 | - 'select', |
|
52 | - ]; |
|
53 | - |
|
54 | - /** |
|
55 | - * The commented out elements qualify as phrasing content but tend to be |
|
56 | - * removed by readability when put into paragraphs, so we ignore them here. |
|
57 | - * |
|
58 | - * @var array |
|
59 | - */ |
|
60 | - private $phrasing_elems = [ |
|
61 | - // 'CANVAS', 'IFRAME', 'SVG', 'VIDEO', |
|
62 | - 'abbr', 'audio', 'b', 'bdo', 'br', 'button', 'cite', 'code', 'data', |
|
63 | - 'datalist', 'dfn', 'em', 'embed', 'i', 'img', 'input', 'kbd', 'label', |
|
64 | - 'mark', 'math', 'meter', 'noscript', 'object', 'output', 'progress', 'q', |
|
65 | - 'ruby', 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', |
|
66 | - 'sup', 'textarea', 'time', 'var', 'wbr' |
|
67 | - ]; |
|
68 | - |
|
69 | - /** |
|
70 | - * initialized getter. |
|
71 | - * |
|
72 | - * @return bool |
|
73 | - */ |
|
74 | - public function isInitialized() |
|
75 | - { |
|
76 | - return $this->initialized; |
|
77 | - } |
|
78 | - |
|
79 | - /** |
|
80 | - * @return bool |
|
81 | - */ |
|
82 | - public function isReadabilityDataTable() |
|
83 | - { |
|
84 | - /* |
|
16 | + /** |
|
17 | + * Content score of the node. Used to determine the value of the content. |
|
18 | + * |
|
19 | + * @var int |
|
20 | + */ |
|
21 | + public $contentScore = 0; |
|
22 | + |
|
23 | + /** |
|
24 | + * Flag for initialized status. |
|
25 | + * |
|
26 | + * @var bool |
|
27 | + */ |
|
28 | + private $initialized = false; |
|
29 | + |
|
30 | + /** |
|
31 | + * Flag data tables. |
|
32 | + * |
|
33 | + * @var bool |
|
34 | + */ |
|
35 | + private $readabilityDataTable = false; |
|
36 | + |
|
37 | + /** |
|
38 | + * @var array |
|
39 | + */ |
|
40 | + private $divToPElements = [ |
|
41 | + 'a', |
|
42 | + 'blockquote', |
|
43 | + 'dl', |
|
44 | + 'div', |
|
45 | + 'img', |
|
46 | + 'ol', |
|
47 | + 'p', |
|
48 | + 'pre', |
|
49 | + 'table', |
|
50 | + 'ul', |
|
51 | + 'select', |
|
52 | + ]; |
|
53 | + |
|
54 | + /** |
|
55 | + * The commented out elements qualify as phrasing content but tend to be |
|
56 | + * removed by readability when put into paragraphs, so we ignore them here. |
|
57 | + * |
|
58 | + * @var array |
|
59 | + */ |
|
60 | + private $phrasing_elems = [ |
|
61 | + // 'CANVAS', 'IFRAME', 'SVG', 'VIDEO', |
|
62 | + 'abbr', 'audio', 'b', 'bdo', 'br', 'button', 'cite', 'code', 'data', |
|
63 | + 'datalist', 'dfn', 'em', 'embed', 'i', 'img', 'input', 'kbd', 'label', |
|
64 | + 'mark', 'math', 'meter', 'noscript', 'object', 'output', 'progress', 'q', |
|
65 | + 'ruby', 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', |
|
66 | + 'sup', 'textarea', 'time', 'var', 'wbr' |
|
67 | + ]; |
|
68 | + |
|
69 | + /** |
|
70 | + * initialized getter. |
|
71 | + * |
|
72 | + * @return bool |
|
73 | + */ |
|
74 | + public function isInitialized() |
|
75 | + { |
|
76 | + return $this->initialized; |
|
77 | + } |
|
78 | + |
|
79 | + /** |
|
80 | + * @return bool |
|
81 | + */ |
|
82 | + public function isReadabilityDataTable() |
|
83 | + { |
|
84 | + /* |
|
85 | 85 | * This is a workaround that I'd like to remove in the future. |
86 | 86 | * Seems that although we are extending the base DOMElement and adding custom properties (like this one, |
87 | 87 | * 'readabilityDataTable'), these properties get lost when you search for elements with getElementsByTagName. |
@@ -91,388 +91,388 @@ discard block |
||
91 | 91 | * |
92 | 92 | * @see https://stackoverflow.com/questions/35654709/php-registernodeclass-and-reusing-variable-names |
93 | 93 | */ |
94 | - return $this->hasAttribute('readabilityDataTable') |
|
95 | - && $this->getAttribute('readabilityDataTable') === '1'; |
|
94 | + return $this->hasAttribute('readabilityDataTable') |
|
95 | + && $this->getAttribute('readabilityDataTable') === '1'; |
|
96 | 96 | // return $this->readabilityDataTable; |
97 | - } |
|
98 | - |
|
99 | - /** |
|
100 | - * @param bool $param |
|
101 | - */ |
|
102 | - public function setReadabilityDataTable($param) |
|
103 | - { |
|
104 | - // Can't be "true" because DOMDocument casts it to "1" |
|
105 | - $this->setAttribute('readabilityDataTable', $param ? '1' : '0'); |
|
97 | + } |
|
98 | + |
|
99 | + /** |
|
100 | + * @param bool $param |
|
101 | + */ |
|
102 | + public function setReadabilityDataTable($param) |
|
103 | + { |
|
104 | + // Can't be "true" because DOMDocument casts it to "1" |
|
105 | + $this->setAttribute('readabilityDataTable', $param ? '1' : '0'); |
|
106 | 106 | // $this->readabilityDataTable = $param; |
107 | - } |
|
108 | - |
|
109 | - /** |
|
110 | - * Initializer. Calculates the current score of the node and returns a full Readability object. |
|
111 | - * |
|
112 | - * @ TODO: I don't like the weightClasses param. How can we get the config here? |
|
113 | - * |
|
114 | - * @param $weightClasses bool Weight classes? |
|
115 | - * |
|
116 | - * @return static |
|
117 | - */ |
|
118 | - public function initializeNode($weightClasses) |
|
119 | - { |
|
120 | - if (!$this->isInitialized()) { |
|
121 | - $contentScore = 0; |
|
122 | - |
|
123 | - switch ($this->nodeName) { |
|
124 | - case 'div': |
|
125 | - $contentScore += 5; |
|
126 | - break; |
|
127 | - |
|
128 | - case 'pre': |
|
129 | - case 'td': |
|
130 | - case 'blockquote': |
|
131 | - $contentScore += 3; |
|
132 | - break; |
|
133 | - |
|
134 | - case 'address': |
|
135 | - case 'ol': |
|
136 | - case 'ul': |
|
137 | - case 'dl': |
|
138 | - case 'dd': |
|
139 | - case 'dt': |
|
140 | - case 'li': |
|
141 | - case 'form': |
|
142 | - $contentScore -= 3; |
|
143 | - break; |
|
144 | - |
|
145 | - case 'h1': |
|
146 | - case 'h2': |
|
147 | - case 'h3': |
|
148 | - case 'h4': |
|
149 | - case 'h5': |
|
150 | - case 'h6': |
|
151 | - case 'th': |
|
152 | - $contentScore -= 5; |
|
153 | - break; |
|
154 | - } |
|
155 | - |
|
156 | - $this->contentScore = $contentScore + ($weightClasses ? $this->getClassWeight() : 0); |
|
157 | - |
|
158 | - $this->initialized = true; |
|
159 | - } |
|
160 | - |
|
161 | - return $this; |
|
162 | - } |
|
163 | - |
|
164 | - /** |
|
165 | - * Override for native getAttribute method. Some nodes have the getAttribute method, some don't, so we need |
|
166 | - * to check first the existence of the attributes property. |
|
167 | - * |
|
168 | - * @param $attributeName string Attribute to retrieve |
|
169 | - * |
|
170 | - * @return string |
|
171 | - */ |
|
172 | - public function getAttribute($attributeName) |
|
173 | - { |
|
174 | - if (!is_null($this->attributes)) { |
|
175 | - return parent::getAttribute($attributeName); |
|
176 | - } |
|
177 | - |
|
178 | - return ''; |
|
179 | - } |
|
180 | - |
|
181 | - /** |
|
182 | - * Override for native hasAttribute. |
|
183 | - * |
|
184 | - * @param $attributeName |
|
185 | - * |
|
186 | - * @return bool |
|
187 | - * |
|
188 | - * @see getAttribute |
|
189 | - */ |
|
190 | - public function hasAttribute($attributeName) |
|
191 | - { |
|
192 | - if (!is_null($this->attributes)) { |
|
193 | - return parent::hasAttribute($attributeName); |
|
194 | - } |
|
195 | - |
|
196 | - return false; |
|
197 | - } |
|
198 | - |
|
199 | - /** |
|
200 | - * Get the ancestors of the current node. |
|
201 | - * |
|
202 | - * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them |
|
203 | - * |
|
204 | - * @return array |
|
205 | - */ |
|
206 | - public function getNodeAncestors($maxLevel = 3) |
|
207 | - { |
|
208 | - $ancestors = []; |
|
209 | - $level = 0; |
|
210 | - |
|
211 | - $node = $this->parentNode; |
|
212 | - |
|
213 | - while ($node && !($node instanceof DOMDocument)) { |
|
214 | - $ancestors[] = $node; |
|
215 | - $level++; |
|
216 | - if ($level === $maxLevel) { |
|
217 | - break; |
|
218 | - } |
|
219 | - $node = $node->parentNode; |
|
220 | - } |
|
221 | - |
|
222 | - return $ancestors; |
|
223 | - } |
|
224 | - |
|
225 | - /** |
|
226 | - * Returns all links from the current element. |
|
227 | - * |
|
228 | - * @return array |
|
229 | - */ |
|
230 | - public function getAllLinks() |
|
231 | - { |
|
232 | - return iterator_to_array($this->getElementsByTagName('a')); |
|
233 | - } |
|
234 | - |
|
235 | - /** |
|
236 | - * Get the density of links as a percentage of the content |
|
237 | - * This is the amount of text that is inside a link divided by the total text in the node. |
|
238 | - * |
|
239 | - * @return int |
|
240 | - */ |
|
241 | - public function getLinkDensity() |
|
242 | - { |
|
243 | - $linkLength = 0; |
|
244 | - $textLength = mb_strlen($this->getTextContent(true)); |
|
245 | - |
|
246 | - if (!$textLength) { |
|
247 | - return 0; |
|
248 | - } |
|
249 | - |
|
250 | - $links = $this->getAllLinks(); |
|
251 | - |
|
252 | - if ($links) { |
|
253 | - /** @var DOMElement $link */ |
|
254 | - foreach ($links as $link) { |
|
255 | - $linkLength += mb_strlen($link->getTextContent(true)); |
|
256 | - } |
|
257 | - } |
|
258 | - |
|
259 | - return $linkLength / $textLength; |
|
260 | - } |
|
261 | - |
|
262 | - /** |
|
263 | - * Calculates the weight of the class/id of the current element. |
|
264 | - * |
|
265 | - * @return int |
|
266 | - */ |
|
267 | - public function getClassWeight() |
|
268 | - { |
|
269 | - $weight = 0; |
|
270 | - |
|
271 | - // Look for a special classname |
|
272 | - $class = $this->getAttribute('class'); |
|
273 | - if (trim($class)) { |
|
274 | - if (preg_match(NodeUtility::$regexps['negative'], $class)) { |
|
275 | - $weight -= 25; |
|
276 | - } |
|
277 | - |
|
278 | - if (preg_match(NodeUtility::$regexps['positive'], $class)) { |
|
279 | - $weight += 25; |
|
280 | - } |
|
281 | - } |
|
282 | - |
|
283 | - // Look for a special ID |
|
284 | - $id = $this->getAttribute('id'); |
|
285 | - if (trim($id)) { |
|
286 | - if (preg_match(NodeUtility::$regexps['negative'], $id)) { |
|
287 | - $weight -= 25; |
|
288 | - } |
|
289 | - |
|
290 | - if (preg_match(NodeUtility::$regexps['positive'], $id)) { |
|
291 | - $weight += 25; |
|
292 | - } |
|
293 | - } |
|
294 | - |
|
295 | - return $weight; |
|
296 | - } |
|
297 | - |
|
298 | - /** |
|
299 | - * Returns the full text of the node. |
|
300 | - * |
|
301 | - * @param bool $normalize Normalize white space? |
|
302 | - * |
|
303 | - * @return string |
|
304 | - */ |
|
305 | - public function getTextContent($normalize = false) |
|
306 | - { |
|
307 | - $nodeValue = $this->nodeValue; |
|
308 | - if ($normalize) { |
|
309 | - $nodeValue = trim(preg_replace('/\s{2,}/', ' ', $nodeValue)); |
|
310 | - } |
|
311 | - |
|
312 | - return $nodeValue; |
|
313 | - } |
|
314 | - |
|
315 | - /** |
|
316 | - * Returns the children of the current node. |
|
317 | - * |
|
318 | - * @param bool $filterEmptyDOMText Filter empty DOMText nodes? |
|
319 | - * |
|
320 | - * @deprecated Use NodeUtility::filterTextNodes, function will be removed in version 3.0 |
|
321 | - * |
|
322 | - * @return array |
|
323 | - */ |
|
324 | - public function getChildren($filterEmptyDOMText = false) |
|
325 | - { |
|
326 | - @trigger_error('getChildren was replaced with NodeUtility::filterTextNodes and will be removed in version 3.0', E_USER_DEPRECATED); |
|
327 | - |
|
328 | - $ret = iterator_to_array($this->childNodes); |
|
329 | - if ($filterEmptyDOMText) { |
|
330 | - // Array values is used to discard the key order. Needs to be 0 to whatever without skipping any number |
|
331 | - $ret = array_values(array_filter($ret, function ($node) { |
|
332 | - return $node->nodeName !== '#text' || mb_strlen(trim($node->nodeValue)); |
|
333 | - })); |
|
334 | - } |
|
335 | - |
|
336 | - return $ret; |
|
337 | - } |
|
338 | - |
|
339 | - /** |
|
340 | - * Return an array indicating how many rows and columns this table has. |
|
341 | - * |
|
342 | - * @return array |
|
343 | - */ |
|
344 | - public function getRowAndColumnCount() |
|
345 | - { |
|
346 | - $rows = $columns = 0; |
|
347 | - $trs = $this->getElementsByTagName('tr'); |
|
348 | - foreach ($trs as $tr) { |
|
349 | - /** @var \DOMElement $tr */ |
|
350 | - $rowspan = $tr->getAttribute('rowspan'); |
|
351 | - $rows += ($rowspan || 1); |
|
352 | - |
|
353 | - // Now look for column-related info |
|
354 | - $columnsInThisRow = 0; |
|
355 | - $cells = $tr->getElementsByTagName('td'); |
|
356 | - foreach ($cells as $cell) { |
|
357 | - /** @var \DOMElement $cell */ |
|
358 | - $colspan = $cell->getAttribute('colspan'); |
|
359 | - $columnsInThisRow += ($colspan || 1); |
|
360 | - } |
|
361 | - $columns = max($columns, $columnsInThisRow); |
|
362 | - } |
|
363 | - |
|
364 | - return ['rows' => $rows, 'columns' => $columns]; |
|
365 | - } |
|
366 | - |
|
367 | - /** |
|
368 | - * Creates a new node based on the text content of the original node. |
|
369 | - * |
|
370 | - * @param $originalNode DOMNode |
|
371 | - * @param $tagName string |
|
372 | - * |
|
373 | - * @return DOMElement |
|
374 | - */ |
|
375 | - public function createNode($originalNode, $tagName) |
|
376 | - { |
|
377 | - $text = $originalNode->getTextContent(); |
|
378 | - $newNode = $originalNode->ownerDocument->createElement($tagName, $text); |
|
379 | - |
|
380 | - return $newNode; |
|
381 | - } |
|
382 | - |
|
383 | - /** |
|
384 | - * Check if a given node has one of its ancestor tag name matching the |
|
385 | - * provided one. |
|
386 | - * |
|
387 | - * @param string $tagName |
|
388 | - * @param int $maxDepth |
|
389 | - * @param callable $filterFn |
|
390 | - * |
|
391 | - * @return bool |
|
392 | - */ |
|
393 | - public function hasAncestorTag($tagName, $maxDepth = 3, callable $filterFn = null) |
|
394 | - { |
|
395 | - $depth = 0; |
|
396 | - $node = $this; |
|
397 | - |
|
398 | - while ($node->parentNode) { |
|
399 | - if ($maxDepth > 0 && $depth > $maxDepth) { |
|
400 | - return false; |
|
401 | - } |
|
402 | - |
|
403 | - if ($node->parentNode->nodeName === $tagName && (!$filterFn || $filterFn($node->parentNode))) { |
|
404 | - return true; |
|
405 | - } |
|
406 | - |
|
407 | - $node = $node->parentNode; |
|
408 | - $depth++; |
|
409 | - } |
|
410 | - |
|
411 | - return false; |
|
412 | - } |
|
413 | - |
|
414 | - /** |
|
415 | - * Check if this node has only whitespace and a single element with given tag |
|
416 | - * or if it contains no element with given tag or more than 1 element. |
|
417 | - * |
|
418 | - * @param $tag string Name of tag |
|
419 | - * |
|
420 | - * @return bool |
|
421 | - */ |
|
422 | - public function hasSingleTagInsideElement($tag) |
|
423 | - { |
|
424 | - // There should be exactly 1 element child with given tag |
|
425 | - if (count($children = NodeUtility::filterTextNodes($this->childNodes)) !== 1 || $children->item(0)->nodeName !== $tag) { |
|
426 | - return false; |
|
427 | - } |
|
428 | - |
|
429 | - // And there should be no text nodes with real content |
|
430 | - return array_reduce(iterator_to_array($children), function ($carry, $child) { |
|
431 | - if (!$carry === false) { |
|
432 | - return false; |
|
433 | - } |
|
434 | - |
|
435 | - /* @var DOMNode $child */ |
|
436 | - return !($child->nodeType === XML_TEXT_NODE && !preg_match('/\S$/', $child->getTextContent())); |
|
437 | - }); |
|
438 | - } |
|
439 | - |
|
440 | - /** |
|
441 | - * Check if the current element has a single child block element. |
|
442 | - * Block elements are the ones defined in the divToPElements array. |
|
443 | - * |
|
444 | - * @return bool |
|
445 | - */ |
|
446 | - public function hasSingleChildBlockElement() |
|
447 | - { |
|
448 | - $result = false; |
|
449 | - if ($this->hasChildNodes()) { |
|
450 | - foreach ($this->childNodes as $child) { |
|
451 | - if (in_array($child->nodeName, $this->divToPElements)) { |
|
452 | - $result = true; |
|
453 | - } else { |
|
454 | - // If any of the hasSingleChildBlockElement calls return true, return true then. |
|
455 | - /** @var $child DOMElement */ |
|
456 | - $result = ($result || $child->hasSingleChildBlockElement()); |
|
457 | - } |
|
458 | - } |
|
459 | - } |
|
460 | - |
|
461 | - return $result; |
|
462 | - } |
|
463 | - |
|
464 | - /** |
|
465 | - * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace. |
|
466 | - * |
|
467 | - * @return bool |
|
468 | - */ |
|
469 | - public function isElementWithoutContent() |
|
470 | - { |
|
471 | - return $this instanceof DOMElement && |
|
472 | - mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent)) === 0 && |
|
473 | - ($this->childNodes->length === 0 || |
|
474 | - $this->childNodes->length === $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length |
|
475 | - /* |
|
107 | + } |
|
108 | + |
|
109 | + /** |
|
110 | + * Initializer. Calculates the current score of the node and returns a full Readability object. |
|
111 | + * |
|
112 | + * @ TODO: I don't like the weightClasses param. How can we get the config here? |
|
113 | + * |
|
114 | + * @param $weightClasses bool Weight classes? |
|
115 | + * |
|
116 | + * @return static |
|
117 | + */ |
|
118 | + public function initializeNode($weightClasses) |
|
119 | + { |
|
120 | + if (!$this->isInitialized()) { |
|
121 | + $contentScore = 0; |
|
122 | + |
|
123 | + switch ($this->nodeName) { |
|
124 | + case 'div': |
|
125 | + $contentScore += 5; |
|
126 | + break; |
|
127 | + |
|
128 | + case 'pre': |
|
129 | + case 'td': |
|
130 | + case 'blockquote': |
|
131 | + $contentScore += 3; |
|
132 | + break; |
|
133 | + |
|
134 | + case 'address': |
|
135 | + case 'ol': |
|
136 | + case 'ul': |
|
137 | + case 'dl': |
|
138 | + case 'dd': |
|
139 | + case 'dt': |
|
140 | + case 'li': |
|
141 | + case 'form': |
|
142 | + $contentScore -= 3; |
|
143 | + break; |
|
144 | + |
|
145 | + case 'h1': |
|
146 | + case 'h2': |
|
147 | + case 'h3': |
|
148 | + case 'h4': |
|
149 | + case 'h5': |
|
150 | + case 'h6': |
|
151 | + case 'th': |
|
152 | + $contentScore -= 5; |
|
153 | + break; |
|
154 | + } |
|
155 | + |
|
156 | + $this->contentScore = $contentScore + ($weightClasses ? $this->getClassWeight() : 0); |
|
157 | + |
|
158 | + $this->initialized = true; |
|
159 | + } |
|
160 | + |
|
161 | + return $this; |
|
162 | + } |
|
163 | + |
|
164 | + /** |
|
165 | + * Override for native getAttribute method. Some nodes have the getAttribute method, some don't, so we need |
|
166 | + * to check first the existence of the attributes property. |
|
167 | + * |
|
168 | + * @param $attributeName string Attribute to retrieve |
|
169 | + * |
|
170 | + * @return string |
|
171 | + */ |
|
172 | + public function getAttribute($attributeName) |
|
173 | + { |
|
174 | + if (!is_null($this->attributes)) { |
|
175 | + return parent::getAttribute($attributeName); |
|
176 | + } |
|
177 | + |
|
178 | + return ''; |
|
179 | + } |
|
180 | + |
|
181 | + /** |
|
182 | + * Override for native hasAttribute. |
|
183 | + * |
|
184 | + * @param $attributeName |
|
185 | + * |
|
186 | + * @return bool |
|
187 | + * |
|
188 | + * @see getAttribute |
|
189 | + */ |
|
190 | + public function hasAttribute($attributeName) |
|
191 | + { |
|
192 | + if (!is_null($this->attributes)) { |
|
193 | + return parent::hasAttribute($attributeName); |
|
194 | + } |
|
195 | + |
|
196 | + return false; |
|
197 | + } |
|
198 | + |
|
199 | + /** |
|
200 | + * Get the ancestors of the current node. |
|
201 | + * |
|
202 | + * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them |
|
203 | + * |
|
204 | + * @return array |
|
205 | + */ |
|
206 | + public function getNodeAncestors($maxLevel = 3) |
|
207 | + { |
|
208 | + $ancestors = []; |
|
209 | + $level = 0; |
|
210 | + |
|
211 | + $node = $this->parentNode; |
|
212 | + |
|
213 | + while ($node && !($node instanceof DOMDocument)) { |
|
214 | + $ancestors[] = $node; |
|
215 | + $level++; |
|
216 | + if ($level === $maxLevel) { |
|
217 | + break; |
|
218 | + } |
|
219 | + $node = $node->parentNode; |
|
220 | + } |
|
221 | + |
|
222 | + return $ancestors; |
|
223 | + } |
|
224 | + |
|
225 | + /** |
|
226 | + * Returns all links from the current element. |
|
227 | + * |
|
228 | + * @return array |
|
229 | + */ |
|
230 | + public function getAllLinks() |
|
231 | + { |
|
232 | + return iterator_to_array($this->getElementsByTagName('a')); |
|
233 | + } |
|
234 | + |
|
235 | + /** |
|
236 | + * Get the density of links as a percentage of the content |
|
237 | + * This is the amount of text that is inside a link divided by the total text in the node. |
|
238 | + * |
|
239 | + * @return int |
|
240 | + */ |
|
241 | + public function getLinkDensity() |
|
242 | + { |
|
243 | + $linkLength = 0; |
|
244 | + $textLength = mb_strlen($this->getTextContent(true)); |
|
245 | + |
|
246 | + if (!$textLength) { |
|
247 | + return 0; |
|
248 | + } |
|
249 | + |
|
250 | + $links = $this->getAllLinks(); |
|
251 | + |
|
252 | + if ($links) { |
|
253 | + /** @var DOMElement $link */ |
|
254 | + foreach ($links as $link) { |
|
255 | + $linkLength += mb_strlen($link->getTextContent(true)); |
|
256 | + } |
|
257 | + } |
|
258 | + |
|
259 | + return $linkLength / $textLength; |
|
260 | + } |
|
261 | + |
|
262 | + /** |
|
263 | + * Calculates the weight of the class/id of the current element. |
|
264 | + * |
|
265 | + * @return int |
|
266 | + */ |
|
267 | + public function getClassWeight() |
|
268 | + { |
|
269 | + $weight = 0; |
|
270 | + |
|
271 | + // Look for a special classname |
|
272 | + $class = $this->getAttribute('class'); |
|
273 | + if (trim($class)) { |
|
274 | + if (preg_match(NodeUtility::$regexps['negative'], $class)) { |
|
275 | + $weight -= 25; |
|
276 | + } |
|
277 | + |
|
278 | + if (preg_match(NodeUtility::$regexps['positive'], $class)) { |
|
279 | + $weight += 25; |
|
280 | + } |
|
281 | + } |
|
282 | + |
|
283 | + // Look for a special ID |
|
284 | + $id = $this->getAttribute('id'); |
|
285 | + if (trim($id)) { |
|
286 | + if (preg_match(NodeUtility::$regexps['negative'], $id)) { |
|
287 | + $weight -= 25; |
|
288 | + } |
|
289 | + |
|
290 | + if (preg_match(NodeUtility::$regexps['positive'], $id)) { |
|
291 | + $weight += 25; |
|
292 | + } |
|
293 | + } |
|
294 | + |
|
295 | + return $weight; |
|
296 | + } |
|
297 | + |
|
298 | + /** |
|
299 | + * Returns the full text of the node. |
|
300 | + * |
|
301 | + * @param bool $normalize Normalize white space? |
|
302 | + * |
|
303 | + * @return string |
|
304 | + */ |
|
305 | + public function getTextContent($normalize = false) |
|
306 | + { |
|
307 | + $nodeValue = $this->nodeValue; |
|
308 | + if ($normalize) { |
|
309 | + $nodeValue = trim(preg_replace('/\s{2,}/', ' ', $nodeValue)); |
|
310 | + } |
|
311 | + |
|
312 | + return $nodeValue; |
|
313 | + } |
|
314 | + |
|
315 | + /** |
|
316 | + * Returns the children of the current node. |
|
317 | + * |
|
318 | + * @param bool $filterEmptyDOMText Filter empty DOMText nodes? |
|
319 | + * |
|
320 | + * @deprecated Use NodeUtility::filterTextNodes, function will be removed in version 3.0 |
|
321 | + * |
|
322 | + * @return array |
|
323 | + */ |
|
324 | + public function getChildren($filterEmptyDOMText = false) |
|
325 | + { |
|
326 | + @trigger_error('getChildren was replaced with NodeUtility::filterTextNodes and will be removed in version 3.0', E_USER_DEPRECATED); |
|
327 | + |
|
328 | + $ret = iterator_to_array($this->childNodes); |
|
329 | + if ($filterEmptyDOMText) { |
|
330 | + // Array values is used to discard the key order. Needs to be 0 to whatever without skipping any number |
|
331 | + $ret = array_values(array_filter($ret, function ($node) { |
|
332 | + return $node->nodeName !== '#text' || mb_strlen(trim($node->nodeValue)); |
|
333 | + })); |
|
334 | + } |
|
335 | + |
|
336 | + return $ret; |
|
337 | + } |
|
338 | + |
|
339 | + /** |
|
340 | + * Return an array indicating how many rows and columns this table has. |
|
341 | + * |
|
342 | + * @return array |
|
343 | + */ |
|
344 | + public function getRowAndColumnCount() |
|
345 | + { |
|
346 | + $rows = $columns = 0; |
|
347 | + $trs = $this->getElementsByTagName('tr'); |
|
348 | + foreach ($trs as $tr) { |
|
349 | + /** @var \DOMElement $tr */ |
|
350 | + $rowspan = $tr->getAttribute('rowspan'); |
|
351 | + $rows += ($rowspan || 1); |
|
352 | + |
|
353 | + // Now look for column-related info |
|
354 | + $columnsInThisRow = 0; |
|
355 | + $cells = $tr->getElementsByTagName('td'); |
|
356 | + foreach ($cells as $cell) { |
|
357 | + /** @var \DOMElement $cell */ |
|
358 | + $colspan = $cell->getAttribute('colspan'); |
|
359 | + $columnsInThisRow += ($colspan || 1); |
|
360 | + } |
|
361 | + $columns = max($columns, $columnsInThisRow); |
|
362 | + } |
|
363 | + |
|
364 | + return ['rows' => $rows, 'columns' => $columns]; |
|
365 | + } |
|
366 | + |
|
367 | + /** |
|
368 | + * Creates a new node based on the text content of the original node. |
|
369 | + * |
|
370 | + * @param $originalNode DOMNode |
|
371 | + * @param $tagName string |
|
372 | + * |
|
373 | + * @return DOMElement |
|
374 | + */ |
|
375 | + public function createNode($originalNode, $tagName) |
|
376 | + { |
|
377 | + $text = $originalNode->getTextContent(); |
|
378 | + $newNode = $originalNode->ownerDocument->createElement($tagName, $text); |
|
379 | + |
|
380 | + return $newNode; |
|
381 | + } |
|
382 | + |
|
383 | + /** |
|
384 | + * Check if a given node has one of its ancestor tag name matching the |
|
385 | + * provided one. |
|
386 | + * |
|
387 | + * @param string $tagName |
|
388 | + * @param int $maxDepth |
|
389 | + * @param callable $filterFn |
|
390 | + * |
|
391 | + * @return bool |
|
392 | + */ |
|
393 | + public function hasAncestorTag($tagName, $maxDepth = 3, callable $filterFn = null) |
|
394 | + { |
|
395 | + $depth = 0; |
|
396 | + $node = $this; |
|
397 | + |
|
398 | + while ($node->parentNode) { |
|
399 | + if ($maxDepth > 0 && $depth > $maxDepth) { |
|
400 | + return false; |
|
401 | + } |
|
402 | + |
|
403 | + if ($node->parentNode->nodeName === $tagName && (!$filterFn || $filterFn($node->parentNode))) { |
|
404 | + return true; |
|
405 | + } |
|
406 | + |
|
407 | + $node = $node->parentNode; |
|
408 | + $depth++; |
|
409 | + } |
|
410 | + |
|
411 | + return false; |
|
412 | + } |
|
413 | + |
|
414 | + /** |
|
415 | + * Check if this node has only whitespace and a single element with given tag |
|
416 | + * or if it contains no element with given tag or more than 1 element. |
|
417 | + * |
|
418 | + * @param $tag string Name of tag |
|
419 | + * |
|
420 | + * @return bool |
|
421 | + */ |
|
422 | + public function hasSingleTagInsideElement($tag) |
|
423 | + { |
|
424 | + // There should be exactly 1 element child with given tag |
|
425 | + if (count($children = NodeUtility::filterTextNodes($this->childNodes)) !== 1 || $children->item(0)->nodeName !== $tag) { |
|
426 | + return false; |
|
427 | + } |
|
428 | + |
|
429 | + // And there should be no text nodes with real content |
|
430 | + return array_reduce(iterator_to_array($children), function ($carry, $child) { |
|
431 | + if (!$carry === false) { |
|
432 | + return false; |
|
433 | + } |
|
434 | + |
|
435 | + /* @var DOMNode $child */ |
|
436 | + return !($child->nodeType === XML_TEXT_NODE && !preg_match('/\S$/', $child->getTextContent())); |
|
437 | + }); |
|
438 | + } |
|
439 | + |
|
440 | + /** |
|
441 | + * Check if the current element has a single child block element. |
|
442 | + * Block elements are the ones defined in the divToPElements array. |
|
443 | + * |
|
444 | + * @return bool |
|
445 | + */ |
|
446 | + public function hasSingleChildBlockElement() |
|
447 | + { |
|
448 | + $result = false; |
|
449 | + if ($this->hasChildNodes()) { |
|
450 | + foreach ($this->childNodes as $child) { |
|
451 | + if (in_array($child->nodeName, $this->divToPElements)) { |
|
452 | + $result = true; |
|
453 | + } else { |
|
454 | + // If any of the hasSingleChildBlockElement calls return true, return true then. |
|
455 | + /** @var $child DOMElement */ |
|
456 | + $result = ($result || $child->hasSingleChildBlockElement()); |
|
457 | + } |
|
458 | + } |
|
459 | + } |
|
460 | + |
|
461 | + return $result; |
|
462 | + } |
|
463 | + |
|
464 | + /** |
|
465 | + * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace. |
|
466 | + * |
|
467 | + * @return bool |
|
468 | + */ |
|
469 | + public function isElementWithoutContent() |
|
470 | + { |
|
471 | + return $this instanceof DOMElement && |
|
472 | + mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent)) === 0 && |
|
473 | + ($this->childNodes->length === 0 || |
|
474 | + $this->childNodes->length === $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length |
|
475 | + /* |
|
476 | 476 | * Special PHP DOMDocument case: We also need to count how many DOMText we have inside the node. |
477 | 477 | * If there's an empty tag with an space inside and a BR (for example "<p> <br/></p>) counting only BRs and |
478 | 478 | * HRs will will say that the example has 2 nodes, instead of one. This happens because in DOMDocument, |
@@ -480,108 +480,108 @@ discard block |
||
480 | 480 | * are dealing with (And at this point we know they are empty or are just whitespace, because of the |
481 | 481 | * mb_strlen in this chain of checks). |
482 | 482 | */ |
483 | - + count(array_filter(iterator_to_array($this->childNodes), function ($child) { |
|
484 | - return $child instanceof DOMText; |
|
485 | - })) |
|
486 | - |
|
487 | - ); |
|
488 | - } |
|
489 | - |
|
490 | - /** |
|
491 | - * Determine if a node qualifies as phrasing content. |
|
492 | - * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content. |
|
493 | - * |
|
494 | - * @return bool |
|
495 | - */ |
|
496 | - public function isPhrasingContent() |
|
497 | - { |
|
498 | - return $this->nodeType === XML_TEXT_NODE || in_array($this->nodeName, $this->phrasing_elems) !== false || |
|
499 | - (!is_null($this->childNodes) && |
|
500 | - ($this->nodeName === 'a' || $this->nodeName === 'del' || $this->nodeName === 'ins') && |
|
501 | - array_reduce(iterator_to_array($this->childNodes), function ($carry, $node) { |
|
502 | - return $node->isPhrasingContent() && $carry; |
|
503 | - }, true) |
|
504 | - ); |
|
505 | - } |
|
506 | - |
|
507 | - /** |
|
508 | - * In the original JS project they check if the node has the style display=none, which unfortunately |
|
509 | - * in our case we have no way of knowing that. So we just check for the attribute hidden or "display: none". |
|
510 | - * |
|
511 | - * Might be a good idea to check for classes or other attributes like 'aria-hidden' |
|
512 | - * |
|
513 | - * @return bool |
|
514 | - */ |
|
515 | - public function isProbablyVisible() |
|
516 | - { |
|
517 | - return !preg_match('/display:( )?none/', $this->getAttribute('style')) && !$this->hasAttribute('hidden'); |
|
518 | - } |
|
519 | - |
|
520 | - /** |
|
521 | - * @return bool |
|
522 | - */ |
|
523 | - public function isWhitespace() |
|
524 | - { |
|
525 | - return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) || |
|
526 | - ($this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br'); |
|
527 | - } |
|
528 | - |
|
529 | - /** |
|
530 | - * This is a hack that overcomes the issue of node shifting when scanning and removing nodes. |
|
531 | - * |
|
532 | - * In the JS version of getElementsByTagName, if you remove a node it will not appear during the |
|
533 | - * foreach. This does not happen in PHP DOMDocument, because if you remove a node, it will still appear but as an |
|
534 | - * orphan node and will give an exception if you try to do anything with it. |
|
535 | - * |
|
536 | - * Shifting also occurs when converting parent nodes (like a P to a DIV), which in that case the found nodes are |
|
537 | - * removed from the foreach "pool" but the internal index of the foreach is not aware and skips over nodes that |
|
538 | - * never looped over. (index is at position 5, 2 nodes are removed, next one should be node 3, but the foreach tries |
|
539 | - * to access node 6) |
|
540 | - * |
|
541 | - * This function solves this by searching for the nodes on every loop and keeping track of the count differences. |
|
542 | - * Because on every loop we call getElementsByTagName again, this could cause a performance impact and should be |
|
543 | - * used only when the results of the search are going to be used to remove the nodes. |
|
544 | - * |
|
545 | - * @param string $tag |
|
546 | - * |
|
547 | - * @return \Generator |
|
548 | - */ |
|
549 | - public function shiftingAwareGetElementsByTagName($tag) |
|
550 | - { |
|
551 | - /** @var $nodes DOMNodeList */ |
|
552 | - $nodes = $this->getElementsByTagName($tag); |
|
553 | - $count = $nodes->length; |
|
554 | - |
|
555 | - for ($i = 0; $i < $count; $i = max(++$i, 0)) { |
|
556 | - yield $nodes->item($i); |
|
557 | - |
|
558 | - // Search for all the nodes again |
|
559 | - $nodes = $this->getElementsByTagName($tag); |
|
560 | - |
|
561 | - // Subtract the amount of nodes removed from the current index |
|
562 | - $i -= $count - $nodes->length; |
|
563 | - |
|
564 | - // Subtract the amount of nodes removed from the current count |
|
565 | - $count -= ($count - $nodes->length); |
|
566 | - } |
|
567 | - } |
|
568 | - |
|
569 | - /** |
|
570 | - * Mimics JS's firstElementChild property. PHP only has firstChild which could be any type of DOMNode. Use this |
|
571 | - * function to get the first one that is an DOMElement node. |
|
572 | - * |
|
573 | - * @return \DOMElement|null |
|
574 | - */ |
|
575 | - public function getFirstElementChild() |
|
576 | - { |
|
577 | - if ($this->childNodes instanceof \Traversable) { |
|
578 | - foreach ($this->childNodes as $node) { |
|
579 | - if ($node instanceof \DOMElement) { |
|
580 | - return $node; |
|
581 | - } |
|
582 | - } |
|
583 | - } |
|
584 | - |
|
585 | - return null; |
|
586 | - } |
|
483 | + + count(array_filter(iterator_to_array($this->childNodes), function ($child) { |
|
484 | + return $child instanceof DOMText; |
|
485 | + })) |
|
486 | + |
|
487 | + ); |
|
488 | + } |
|
489 | + |
|
490 | + /** |
|
491 | + * Determine if a node qualifies as phrasing content. |
|
492 | + * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content. |
|
493 | + * |
|
494 | + * @return bool |
|
495 | + */ |
|
496 | + public function isPhrasingContent() |
|
497 | + { |
|
498 | + return $this->nodeType === XML_TEXT_NODE || in_array($this->nodeName, $this->phrasing_elems) !== false || |
|
499 | + (!is_null($this->childNodes) && |
|
500 | + ($this->nodeName === 'a' || $this->nodeName === 'del' || $this->nodeName === 'ins') && |
|
501 | + array_reduce(iterator_to_array($this->childNodes), function ($carry, $node) { |
|
502 | + return $node->isPhrasingContent() && $carry; |
|
503 | + }, true) |
|
504 | + ); |
|
505 | + } |
|
506 | + |
|
507 | + /** |
|
508 | + * In the original JS project they check if the node has the style display=none, which unfortunately |
|
509 | + * in our case we have no way of knowing that. So we just check for the attribute hidden or "display: none". |
|
510 | + * |
|
511 | + * Might be a good idea to check for classes or other attributes like 'aria-hidden' |
|
512 | + * |
|
513 | + * @return bool |
|
514 | + */ |
|
515 | + public function isProbablyVisible() |
|
516 | + { |
|
517 | + return !preg_match('/display:( )?none/', $this->getAttribute('style')) && !$this->hasAttribute('hidden'); |
|
518 | + } |
|
519 | + |
|
520 | + /** |
|
521 | + * @return bool |
|
522 | + */ |
|
523 | + public function isWhitespace() |
|
524 | + { |
|
525 | + return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) || |
|
526 | + ($this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br'); |
|
527 | + } |
|
528 | + |
|
529 | + /** |
|
530 | + * This is a hack that overcomes the issue of node shifting when scanning and removing nodes. |
|
531 | + * |
|
532 | + * In the JS version of getElementsByTagName, if you remove a node it will not appear during the |
|
533 | + * foreach. This does not happen in PHP DOMDocument, because if you remove a node, it will still appear but as an |
|
534 | + * orphan node and will give an exception if you try to do anything with it. |
|
535 | + * |
|
536 | + * Shifting also occurs when converting parent nodes (like a P to a DIV), which in that case the found nodes are |
|
537 | + * removed from the foreach "pool" but the internal index of the foreach is not aware and skips over nodes that |
|
538 | + * never looped over. (index is at position 5, 2 nodes are removed, next one should be node 3, but the foreach tries |
|
539 | + * to access node 6) |
|
540 | + * |
|
541 | + * This function solves this by searching for the nodes on every loop and keeping track of the count differences. |
|
542 | + * Because on every loop we call getElementsByTagName again, this could cause a performance impact and should be |
|
543 | + * used only when the results of the search are going to be used to remove the nodes. |
|
544 | + * |
|
545 | + * @param string $tag |
|
546 | + * |
|
547 | + * @return \Generator |
|
548 | + */ |
|
549 | + public function shiftingAwareGetElementsByTagName($tag) |
|
550 | + { |
|
551 | + /** @var $nodes DOMNodeList */ |
|
552 | + $nodes = $this->getElementsByTagName($tag); |
|
553 | + $count = $nodes->length; |
|
554 | + |
|
555 | + for ($i = 0; $i < $count; $i = max(++$i, 0)) { |
|
556 | + yield $nodes->item($i); |
|
557 | + |
|
558 | + // Search for all the nodes again |
|
559 | + $nodes = $this->getElementsByTagName($tag); |
|
560 | + |
|
561 | + // Subtract the amount of nodes removed from the current index |
|
562 | + $i -= $count - $nodes->length; |
|
563 | + |
|
564 | + // Subtract the amount of nodes removed from the current count |
|
565 | + $count -= ($count - $nodes->length); |
|
566 | + } |
|
567 | + } |
|
568 | + |
|
569 | + /** |
|
570 | + * Mimics JS's firstElementChild property. PHP only has firstChild which could be any type of DOMNode. Use this |
|
571 | + * function to get the first one that is an DOMElement node. |
|
572 | + * |
|
573 | + * @return \DOMElement|null |
|
574 | + */ |
|
575 | + public function getFirstElementChild() |
|
576 | + { |
|
577 | + if ($this->childNodes instanceof \Traversable) { |
|
578 | + foreach ($this->childNodes as $node) { |
|
579 | + if ($node instanceof \DOMElement) { |
|
580 | + return $node; |
|
581 | + } |
|
582 | + } |
|
583 | + } |
|
584 | + |
|
585 | + return null; |
|
586 | + } |
|
587 | 587 | } |