Total Complexity | 99 |
Total Lines | 572 |
Duplicated Lines | 0 % |
Changes | 0 |
Complex classes like NodeTrait often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use NodeTrait, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
14 | trait NodeTrait |
||
15 | { |
||
/**
 * Score assigned to this node; used to judge how valuable its content is.
 *
 * @var int
 */
public $contentScore = 0;

/**
 * Whether the node has already been initialized.
 *
 * @var bool
 */
private $initialized = false;

/**
 * Data table flag.
 *
 * @var bool
 */
private $readabilityDataTable = false;

/**
 * Tag names treated as block elements by hasSingleChildBlockElement().
 *
 * @var array
 */
private $divToPElements = [
    'a', 'blockquote', 'dl', 'div', 'img', 'ol',
    'p', 'pre', 'table', 'ul', 'select',
];

/**
 * Tag names considered phrasing content.
 *
 * CANVAS, IFRAME, SVG and VIDEO also qualify as phrasing content, but they tend to be removed by
 * readability when put into paragraphs, so they are deliberately left out of this list.
 *
 * @var array
 */
private $phrasing_elems = [
    'abbr',
    'audio',
    'b',
    'bdo',
    'br',
    'button',
    'cite',
    'code',
    'data',
    'datalist',
    'dfn',
    'em',
    'embed',
    'i',
    'img',
    'input',
    'kbd',
    'label',
    'mark',
    'math',
    'meter',
    'noscript',
    'object',
    'output',
    'progress',
    'q',
    'ruby',
    'samp',
    'script',
    'select',
    'small',
    'span',
    'strong',
    'sub',
    'sup',
    'textarea',
    'time',
    'var',
    'wbr'
];
||
68 | |||
/**
 * Tells whether this node has already been initialized.
 *
 * @return bool Current value of the initialized flag
 */
public function isInitialized()
{
    return $this->initialized;
}
||
78 | |||
79 | /** |
||
80 | * @return bool |
||
81 | */ |
||
82 | public function isReadabilityDataTable() |
||
96 | // return $this->readabilityDataTable; |
||
97 | } |
||
98 | |||
/**
 * Stores the data table flag on the node as the "readabilityDataTable" attribute.
 *
 * @param bool $param Truthy values are stored as '1', everything else as '0'
 */
public function setReadabilityDataTable($param)
{
    // The flag is written as '1'/'0' and not as a boolean because DOMDocument casts "true" to "1"
    $flag = $param ? '1' : '0';

    $this->setAttribute('readabilityDataTable', $flag);
}
||
108 | |||
/**
 * Initializer. Computes the starting content score of the node from its tag name and, optionally,
 * from the weight of its class/id attributes. Nodes that are already initialized are left untouched.
 *
 * @ TODO: I don't like the weightClasses param. How can we get the config here?
 *
 * @param $weightClasses bool Add the result of getClassWeight() to the score?
 *
 * @return static The node itself
 */
public function initializeNode($weightClasses)
{
    if ($this->isInitialized()) {
        return $this;
    }

    // Base score per tag name; tags that are not listed start at 0
    $baseScores = [
        'div' => 5,

        'pre' => 3,
        'td' => 3,
        'blockquote' => 3,

        'address' => -3,
        'ol' => -3,
        'ul' => -3,
        'dl' => -3,
        'dd' => -3,
        'dt' => -3,
        'li' => -3,
        'form' => -3,

        'h1' => -5,
        'h2' => -5,
        'h3' => -5,
        'h4' => -5,
        'h5' => -5,
        'h6' => -5,
        'th' => -5,
    ];

    $tag = $this->nodeName;
    $baseScore = isset($baseScores[$tag]) ? $baseScores[$tag] : 0;
    $classWeight = $weightClasses ? $this->getClassWeight() : 0;

    $this->contentScore = $baseScore + $classWeight;
    $this->initialized = true;

    return $this;
}
||
163 | |||
/**
 * Override for the native getAttribute method. Not every node has attributes, so the attributes
 * property is checked first and the native method is only called when it is set.
 *
 * @param $attributeName string Attribute to retrieve
 *
 * @return string Result of the native getAttribute, or an empty string when the attributes property is null
 */
public function getAttribute($attributeName)
{
    if ($this->attributes === null) {
        return '';
    }

    return parent::getAttribute($attributeName);
}
||
180 | |||
/**
 * Override for the native hasAttribute method. The attributes property is checked first and the
 * native method is only called when it is set.
 *
 * @param $attributeName
 *
 * @return bool Result of the native hasAttribute, or false when the attributes property is null
 *
 * @see getAttribute
 */
public function hasAttribute($attributeName)
{
    if ($this->attributes === null) {
        return false;
    }

    return parent::hasAttribute($attributeName);
}
||
198 | |||
199 | /** |
||
200 | * Get the ancestors of the current node. |
||
201 | * |
||
202 | * @param int|bool $maxLevel Max amount of ancestors to get. False for all of them |
||
203 | * |
||
204 | * @return array |
||
205 | */ |
||
206 | public function getNodeAncestors($maxLevel = 3) |
||
223 | } |
||
224 | |||
/**
 * Collects every "a" element found inside the current element.
 *
 * @return array The matching elements as a plain PHP array
 */
public function getAllLinks()
{
    $anchors = $this->getElementsByTagName('a');

    return iterator_to_array($anchors);
}
||
234 | |||
/**
 * Get the density of links as a fraction of the content: the length of the normalized text found
 * inside links divided by the length of the normalized text of the whole node.
 *
 * @return int|float 0 when the node has no text, otherwise the link text length divided by the total text length
 */
public function getLinkDensity()
{
    $totalLength = mb_strlen($this->getTextContent(true));

    // Nothing to measure against (this also avoids dividing by zero)
    if (!$totalLength) {
        return 0;
    }

    $insideLinks = 0;

    /** @var DOMElement $anchor */
    foreach ($this->getAllLinks() as $anchor) {
        $insideLinks += mb_strlen($anchor->getTextContent(true));
    }

    return $insideLinks / $totalLength;
}
||
261 | |||
/**
 * Calculates the weight of the class and id attributes of the current element.
 *
 * Each of the two attributes is tested against the 'negative' and 'positive' patterns of
 * NodeUtility::$regexps: a negative match subtracts 25, a positive match adds 25.
 *
 * @return int
 */
public function getClassWeight()
{
    $total = 0;

    foreach (['class', 'id'] as $attribute) {
        $value = $this->getAttribute($attribute);

        // Attributes whose trimmed value is falsy are not scored
        if (!trim($value)) {
            continue;
        }

        if (preg_match(NodeUtility::$regexps['negative'], $value)) {
            $total -= 25;
        }

        if (preg_match(NodeUtility::$regexps['positive'], $value)) {
            $total += 25;
        }
    }

    return $total;
}
||
297 | |||
/**
 * Returns the text of the node, taken from its nodeValue.
 *
 * @param bool $normalize When true, runs of two or more whitespace characters are replaced by a
 *                        single space and the result is trimmed
 *
 * @return string
 */
public function getTextContent($normalize = false)
{
    if (!$normalize) {
        return $this->nodeValue;
    }

    $collapsed = preg_replace('/\s{2,}/', ' ', $this->nodeValue);

    return trim($collapsed);
}
||
314 | |||
/**
 * Returns the child nodes of the current node as an array.
 *
 * @param bool $filterEmptyDOMText Drop text nodes that are empty or contain only whitespace?
 *
 * @deprecated Use NodeUtility::filterTextNodes, function will be removed in version 3.0
 *
 * @return array Child nodes; when filtering, the result is re-indexed from 0 without gaps
 */
public function getChildren($filterEmptyDOMText = false)
{
    @trigger_error('getChildren was replaced with NodeUtility::filterTextNodes and will be removed in version 3.0', E_USER_DEPRECATED);

    $children = iterator_to_array($this->childNodes);

    if (!$filterEmptyDOMText) {
        return $children;
    }

    // Appending to a fresh array keeps the keys sequential (0 to whatever, without skipping any number)
    $kept = [];
    foreach ($children as $child) {
        if ($child->nodeName !== '#text' || mb_strlen(trim($child->nodeValue))) {
            $kept[] = $child;
        }
    }

    return $kept;
}
||
338 | |||
/**
 * Return an array indicating how many rows and columns this table has.
 *
 * Every "tr" element found below this node adds its "rowspan" value to the row total, and every "td"
 * element found inside a row adds its "colspan" value to the width of that row. A span attribute that
 * is missing, empty, non-numeric or lower than 1 counts as 1. The column count is the width of the
 * widest row.
 *
 * @return array Associative array with the integer entries 'rows' and 'columns'
 */
public function getRowAndColumnCount()
{
    // The previous `($span || 1)` was a JS idiom: in PHP the || operator yields a boolean, so every
    // row and every cell was counted as exactly 1 and the span attributes were ignored.
    $spanOf = static function ($element, $attribute) {
        $span = (int) $element->getAttribute($attribute);

        return $span > 0 ? $span : 1;
    };

    $rows = $columns = 0;

    /** @var \DOMElement $tr */
    foreach ($this->getElementsByTagName('tr') as $tr) {
        $rows += $spanOf($tr, 'rowspan');

        // Now look for column-related info
        $columnsInThisRow = 0;

        /** @var \DOMElement $cell */
        foreach ($tr->getElementsByTagName('td') as $cell) {
            $columnsInThisRow += $spanOf($cell, 'colspan');
        }

        $columns = max($columns, $columnsInThisRow);
    }

    return ['rows' => $rows, 'columns' => $columns];
}
||
366 | |||
367 | /** |
||
368 | * Creates a new node based on the text content of the original node. |
||
369 | * |
||
370 | * @param $originalNode DOMNode |
||
371 | * @param $tagName string |
||
372 | * |
||
373 | * @return DOMElement |
||
374 | */ |
||
375 | public function createNode($originalNode, $tagName) |
||
381 | } |
||
382 | |||
/**
 * Check whether one of the ancestors of this node has the given tag name.
 *
 * @param string $tagName Node name to look for
 * @param int $maxDepth Limit on how far up the tree to look; values of 0 or less disable the limit
 * @param callable $filterFn Optional callback that receives the matching ancestor; the ancestor only
 *                           counts when the callback returns a truthy value
 *
 * @return bool
 */
public function hasAncestorTag($tagName, $maxDepth = 3, callable $filterFn = null)
{
    $ancestor = $this->parentNode;

    for ($depth = 0; $ancestor; $depth++) {
        if ($maxDepth > 0 && $depth > $maxDepth) {
            return false;
        }

        if ($ancestor->nodeName === $tagName && (!$filterFn || $filterFn($ancestor))) {
            return true;
        }

        $ancestor = $ancestor->parentNode;
    }

    return false;
}
||
413 | |||
414 | /** |
||
415 | * Check if this node has only whitespace and a single element with given tag |
||
416 | * or if it contains no element with given tag or more than 1 element. |
||
417 | * |
||
418 | * @param $tag string Name of tag |
||
419 | * |
||
420 | * @return bool |
||
421 | */ |
||
422 | public function hasSingleTagInsideElement($tag) |
||
437 | }); |
||
438 | } |
||
439 | |||
/**
 * Check if the current element contains a block element, either as a direct child or further down
 * inside one of its non-block children.
 * Block elements are the ones defined in the divToPElements array.
 *
 * @return bool
 */
public function hasSingleChildBlockElement()
{
    if (!$this->hasChildNodes()) {
        return false;
    }

    foreach ($this->childNodes as $child) {
        // A direct block child settles the question
        if (in_array($child->nodeName, $this->divToPElements)) {
            return true;
        }

        // Otherwise ask the child itself; the first positive answer is enough
        /** @var $child DOMElement */
        if ($child->hasSingleChildBlockElement()) {
            return true;
        }
    }

    return false;
}
||
463 | |||
/**
 * Determines if a node has no content or it is just a bunch of dividing lines and/or whitespace.
 *
 * @return bool
 */
public function isElementWithoutContent()
{
    if (!($this instanceof DOMElement)) {
        return false;
    }

    // Any text left after stripping the 'onlyWhitespace' pattern means the element has content
    $visibleText = preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $this->textContent);
    if (mb_strlen($visibleText) !== 0) {
        return false;
    }

    $childCount = $this->childNodes->length;
    if ($childCount === 0) {
        return true;
    }

    /*
     * Special PHP DOMDocument case: DOMText nodes are counted as child nodes too (which doesn't happen
     * in JS). For an empty tag with a space inside and a BR (for example "<p> <br/></p>") counting only
     * BRs and HRs would never match the number of children, so the DOMText children have to be counted
     * as well. At this point we know they are empty or just whitespace, because of the mb_strlen check
     * above.
     */
    $textNodeCount = 0;
    foreach ($this->childNodes as $child) {
        if ($child instanceof DOMText) {
            $textNodeCount++;
        }
    }

    $dividerCount = $this->getElementsByTagName('br')->length + $this->getElementsByTagName('hr')->length;

    return $childCount === $dividerCount + $textNodeCount;
}
||
489 | |||
490 | /** |
||
491 | * Determine if a node qualifies as phrasing content. |
||
492 | * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content. |
||
493 | * |
||
494 | * @return bool |
||
495 | */ |
||
496 | public function isPhrasingContent() |
||
504 | ); |
||
505 | } |
||
506 | |||
507 | /** |
||
508 | * In the original JS project they check if the node has the style display=none, which unfortunately |
||
509 | * in our case we have no way of knowing that. So we just check for the attribute hidden or "display: none". |
||
510 | * |
||
511 | * Might be a good idea to check for classes or other attributes like 'aria-hidden' |
||
512 | * |
||
513 | * @return bool |
||
514 | */ |
||
515 | public function isProbablyVisible() |
||
518 | } |
||
519 | |||
/**
 * Tells whether the node is whitespace: a text node whose trimmed content is empty, or a "br" element.
 *
 * @return bool
 */
public function isWhitespace()
{
    if ($this->nodeType === XML_TEXT_NODE) {
        return mb_strlen(trim($this->textContent)) === 0;
    }

    return $this->nodeType === XML_ELEMENT_NODE && $this->nodeName === 'br';
}
||
528 | |||
529 | /** |
||
530 | * This is a hack that overcomes the issue of node shifting when scanning and removing nodes. |
||
531 | * |
||
532 | * In the JS version of getElementsByTagName, if you remove a node it will not appear during the |
||
533 | * foreach. This does not happen in PHP DOMDocument, because if you remove a node, it will still appear but as an |
||
534 | * orphan node and will give an exception if you try to do anything with it. |
||
535 | * |
||
536 | * Shifting also occurs when converting parent nodes (like a P to a DIV), which in that case the found nodes are |
||
537 | * removed from the foreach "pool" but the internal index of the foreach is not aware and skips over nodes that |
||
538 | * never looped over. (index is at position 5, 2 nodes are removed, next one should be node 3, but the foreach tries |
||
539 | * to access node 6) |
||
540 | * |
||
541 | * This function solves this by searching for the nodes on every loop and keeping track of the count differences. |
||
542 | * Because on every loop we call getElementsByTagName again, this could cause a performance impact and should be |
||
543 | * used only when the results of the search are going to be used to remove the nodes. |
||
544 | * |
||
545 | * @param string $tag |
||
546 | * |
||
547 | * @return \Generator |
||
548 | */ |
||
549 | public function shiftingAwareGetElementsByTagName($tag) |
||
566 | } |
||
567 | } |
||
568 | |||
569 | /** |
||
570 | * Mimics JS's firstElementChild property. PHP only has firstChild which could be any type of DOMNode. Use this |
||
571 | * function to get the first one that is an DOMElement node. |
||
572 | * |
||
573 | * @return \DOMElement|null |
||
574 | */ |
||
575 | public function getFirstElementChild() |
||
586 | } |
||
587 | } |
||
588 |