scotteh /
php-goose
| 1 | <?php declare(strict_types=1); |
||
| 2 | |||
| 3 | namespace Goose\Traits; |
||
| 4 | |||
| 5 | use Goose\Utils\Helper; |
||
| 6 | use DOMWrap\Element; |
||
| 7 | |||
/**
 * Node Common Trait
 *
 * Heuristics shared by classes that need to judge DOM nodes.
 *
 * @package Goose\Traits
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
trait NodeCommonTrait {
    /**
     * Checks the density of links within a node: is there little text and
     * does most of it sit inside links? If so, the node is no good.
     *
     * The score is computed as
     *
     *     (words inside links / words in the whole node) * number of links
     *
     * and the node is considered link-heavy when the score reaches $limit.
     * Nodes without any link, or without any text, are never link-heavy.
     *
     * @param Element $node Node to inspect
     * @param float $limit Score at or above which the node counts as link-heavy
     *
     * @return bool True when the link density score is >= $limit
     */
    private function isHighLinkDensity(Element $node, float $limit = 1.0): bool {
        // Same tokeniser for the node text and the link text, so that the
        // ratio below compares like with like.
        $wordPattern = '@[\s]+@iu';

        $links = $node->find('a, [onclick]');
        $numberOfLinks = $links->count();

        if ($numberOfLinks === 0) {
            return false;
        }

        $words = preg_split($wordPattern, $node->text(), -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on failure (e.g. malformed UTF-8 with the
        // /u modifier); an empty list means the node holds no text at all.
        if ($words === false || $words === []) {
            return false;
        }

        // Count the words of each link separately. Gluing the link texts
        // together first would merge the last word of one link with the first
        // word of the next, and splitting an empty string with explode()
        // would report one word for a link that has no text.
        $numberOfLinkWords = 0;
        foreach ($links as $link) {
            $linkWords = preg_split(
                $wordPattern,
                Helper::textNormalise($link->text()),
                -1,
                PREG_SPLIT_NO_EMPTY
            );

            if ($linkWords !== false) {
                $numberOfLinkWords += count($linkWords);
            }
        }

        $linkDivisor = $numberOfLinkWords / count($words);
        $score = $linkDivisor * $numberOfLinks;

        return $score >= $limit;
    }
}
||
| 56 |