ContentExtractor::updateBestNodeCandidates() - Code Metrics - Inspection of "Merge branch 'master' of github.com:scotteh/php-go..." - scotteh/php-goose - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 9c6482...84b1b9 )

by Andrew

created 2017-11-15 10:12 UTC

ContentExtractor::updateBestNodeCandidates() B

↳ Parent: ContentExtractor

Complexity

Conditions	6
Paths	12

Size

Total Lines	17
Code Lines	9

Duplication

Lines	10
Ratio	58.82 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
dl	10
loc	17
rs	8.8571
c	1
b	0
f	0
cc	6
eloc	9
nc	12
nop	2

<?php

namespace Goose\Modules\Extractors;

use Goose\Article;
use Goose\Traits\NodeCommonTrait;
use Goose\Traits\NodeGravityTrait;
use Goose\Traits\ArticleMutatorTrait;
use Goose\Modules\AbstractModule;
use Goose\Modules\ModuleInterface;
use DOMWrap\Element;

/**
 * Content Extractor
 *
 * @package Goose\Modules\Extractors
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
class ContentExtractor extends AbstractModule implements ModuleInterface {
    use ArticleMutatorTrait, NodeGravityTrait, NodeCommonTrait;

    /**
     * Register the article with this module, then store the best-scoring
     * content node (or null when none qualifies) as the article's top node.
     *
     * @param Article $article
     */
    public function run(Article $article) {
        $this->article($article);

        $article->setTopNode($this->getTopNode());
    }

    /**
     * Collect every `p`, `td` and `pre` node of the document that contains
     * more than two stop words and is not flagged as having high link density.
     *
     * @param Article $article
     *
     * @return array
     */
    private function getTopNodeCandidatesByContents(Article $article) {
        $results = [];

        $nodes = $article->getDoc()->find('p, td, pre');

        foreach ($nodes as $node) {
            $wordStats = $this->config()->getStopWords()->getStopwordCount($node->text());
            $highLinkDensity = $this->isHighLinkDensity($node);

            if ($wordStats->getStopWordCount() > 2 && !$highLinkDensity) {
                $results[] = $node;
            }
        }

        return $results;
    }

    /**
     * Compute the score a candidate node contributes to its ancestors:
     * the node's stop word count plus a position-dependent boost.
     *
     * The boost starts at 50 for the first boosted node and shrinks as 50 / ($i + 1).
     * When there are more than 15 candidates, nodes in the last quarter of the list
     * receive a negative boost instead (the squared distance into that quarter).
     *
     * @param Element $node
     * @param int $i Zero-based index of this node among the boosted nodes
     * @param int $totalNodes Total number of candidate nodes
     *
     * @return double
     */
    private function getTopNodeCandidateScore(Element $node, $i, $totalNodes) {
        $boostScore = (1.0 / ($i + 1)) * 50;
        $bottomNodesForNegativeScore = $totalNodes * 0.25;
        $nodesRemaining = $totalNodes - $i;

        if ($totalNodes > 15 && $nodesRemaining <= $bottomNodesForNegativeScore) {
            $penalty = pow($bottomNodesForNegativeScore - $nodesRemaining, 2);

            // Existing behaviour, kept as-is: a penalty larger than 40 is not applied;
            // the boost becomes a flat +5 instead.
            $boostScore = ($penalty > 40) ? 5 : -$penalty;
        }

        $wordStats = $this->config()->getStopWords()->getStopwordCount($node->text());

        return $wordStats->getStopWordCount() + $boostScore;
    }

    /**
     * Pick the node with the highest positive score from the given candidates.
     *
     * @param array $nodes
     *
     * @return Element|null The best node, or null when there are no candidates,
     *                      no candidate has a positive score, or the best score is below 20
     */
    private function getTopNodeByScore($nodes) {
        $topNode = null;
        $topNodeScore = 0;

        foreach ($nodes as $node) {
            $score = $this->getScore($node);

            if ($score > $topNodeScore) {
                $topNode = $node;
                $topNodeScore = $score;
            }
        }

        // $topNodeScore always holds the score of $topNode, so there is no need to
        // query it again. Scores under 20 are treated as "no usable content node".
        if ($topNode === null || $topNodeScore < 20) {
            return null;
        }

        return $topNode;
    }

    /**
     * Propagate a candidate's score upwards: the parent element receives the full
     * score, the grandparent element half of it. Each of them also has its node
     * count increased by one.
     *
     * @param Element $node
     * @param double $upscore
     */
    private function calculateBestNodeCandidateScores(Element $node, $upscore) {
        $parent = $node->parent();

        if (!($parent instanceof Element)) {
            return;
        }

        $this->updateScore($parent, $upscore);
        $this->updateNodeCount($parent, 1);

        $grandParent = $parent->parent();

        if ($grandParent instanceof Element) {
            $this->updateScore($grandParent, $upscore / 2);
            $this->updateNodeCount($grandParent, 1);
        }
    }

    /**
     * Add the parent and grandparent of the given node to the candidate list,
     * skipping anything that is not an Element or is already in the list.
     *
     * @param Element $node
     * @param array $nodeCandidates
     *
     * @return array The candidate list including any newly added ancestors
     */
    private function updateBestNodeCandidates(Element $node, $nodeCandidates) {
        $parent = $node->parent();

        // Without an element parent there is no grandparent to consider either.
        if (!($parent instanceof Element)) {
            return $nodeCandidates;
        }

        foreach ([$parent, $parent->parent()] as $ancestor) {
            if ($ancestor instanceof Element && !in_array($ancestor, $nodeCandidates, true)) {
                $nodeCandidates[] = $ancestor;
            }
        }

        return $nodeCandidates;
    }

    /**
     * We're going to start looking for where the clusters of paragraphs are. We'll score a cluster based on the number of stopwords
     * and the number of consecutive paragraphs together, which should form the cluster of text that this node is around.
     * We also score on how high up the paragraphs are: comments are usually at the bottom and should get a lower score.
     *
     * @return Element|null
     */
    public function getTopNode() {
        $nodes = $this->getTopNodeCandidatesByContents($this->article());
        $totalNodes = count($nodes);

        $nodeCandidates = [];

        // $i counts only the nodes that were actually boosted, not every candidate.
        $i = 0;
        foreach ($nodes as $node) {
            if (!$this->isOkToBoost($node)) {
                continue;
            }

            $upscore = $this->getTopNodeCandidateScore($node, $i, $totalNodes);

            $this->calculateBestNodeCandidateScores($node, $upscore);
            $nodeCandidates = $this->updateBestNodeCandidates($node, $nodeCandidates);

            $i++;
        }

        return $this->getTopNodeByScore($nodeCandidates);
    }

    /**
     * A lot of times the first paragraph might be the caption under an image so we'll want to make sure if we're going to
     * boost a parent node that it should be connected to other paragraphs, at least for the first n paragraphs.
     *
     * The check walks the preceding sibling elements of the node and only considers `p` and `strong` elements:
     * it succeeds as soon as one of them contains more than 5 stop words, and gives up once 3 of them
     * have been inspected without success.
     *
     * @param Element $node
     *
     * @return bool
     */
    private function isOkToBoost(Element $node) {
        $stepsAway = 0;
        $minimumStopWordCount = 5;
        $maxStepsAwayFromNode = 3;

        // Find all previous sibling element nodes
        // NOTE(review): iteration order (nearest sibling first?) depends on precedingAll() - confirm
        $siblings = $node->precedingAll(function($sibling) {
            return $sibling instanceof Element;
        });

        foreach ($siblings as $sibling) {
            if (!$sibling->is('p, strong')) {
                continue;
            }

            if ($stepsAway >= $maxStepsAwayFromNode) {
                return false;
            }

            $wordStats = $this->config()->getStopWords()->getStopwordCount($sibling->text());

            if ($wordStats->getStopWordCount() > $minimumStopWordCount) {
                return true;
            }

            $stepsAway += 1;
        }

        return false;
    }
}


1		<?php
2
3		namespace Goose\Modules\Extractors;
4
5		use Goose\Article;
6		use Goose\Traits\NodeCommonTrait;
7		use Goose\Traits\NodeGravityTrait;
8		use Goose\Traits\ArticleMutatorTrait;
9		use Goose\Modules\AbstractModule;
10		use Goose\Modules\ModuleInterface;
11		use DOMWrap\Element;
12
13		/**
14		* Content Extractor
15		*
16		* @package Goose\Modules\Extractors
17		* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
18		*/
19		class ContentExtractor extends AbstractModule implements ModuleInterface {
20		use ArticleMutatorTrait, NodeGravityTrait, NodeCommonTrait;
21
22		/**
23		* @param Article $article
24		*/
25		public function run(Article $article) {
26		$this->article($article);
27
28		$article->setTopNode($this->getTopNode());
29		}
30
31		/**
32		* @param Article $article
33		*
34		* @return array
35		*/
36		private function getTopNodeCandidatesByContents(Article $article) {
37		$results = [];
38
39		$nodes = $article->getDoc()->find('p, td, pre');
40
41		foreach ($nodes as $node) {
42		$wordStats = $this->config()->getStopWords()->getStopwordCount($node->text());
43		$highLinkDensity = $this->isHighLinkDensity($node);
44
45		if ($wordStats->getStopWordCount() > 2 && !$highLinkDensity) {
46		$results[] = $node;
47		}
48		}
49
50		return $results;
51		}
52
53		/**
54		* @param Element $node
55		* @param int $i
56		* @param int $totalNodes
57		*
58		* @return double
59		*/
60		private function getTopNodeCandidateScore(Element $node, $i, $totalNodes) {
61		$boostScore = (1.0 / ($i + 1)) * 50;
62		$bottomNodesForNegativeScore = $totalNodes * 0.25;
63
64		if ($totalNodes > 15) {
65		if ($totalNodes - $i <= $bottomNodesForNegativeScore) {
66		$booster = $bottomNodesForNegativeScore - ($totalNodes - $i);
67		$boostScore = pow($booster, 2) * -1;
68		$negscore = abs($boostScore);
69		if ($negscore > 40) {
70		$boostScore = 5;
71		}
72		}
73		}
74
75		$wordStats = $this->config()->getStopWords()->getStopwordCount($node->text());
76		$upscore = $wordStats->getStopWordCount() + $boostScore;
77
78		return $upscore;
79		}
80
81		/**
82		* @param array $nodes
83		*
84		* @return Element|null
85		*/
86		private function getTopNodeByScore($nodes) {
87		$topNode = null;
88		$topNodeScore = 0;
89
90		foreach ($nodes as $node) {
91		$score = $this->getScore($node);
92
93		if ($score > $topNodeScore) {
94		$topNode = $node;
95		$topNodeScore = $score;
96		}
97
98		if ($topNode === false) {
99		$topNode = $node;
100		}
101		}
102
103		if ($topNode && $this->getScore($topNode) < 20) {
104		return null;
105		}
106
107		return $topNode;
108		}
109
110		/**
111		* @param Element $node
112		* @param double $upscore
113		*/
114		private function calculateBestNodeCandidateScores(Element $node, $upscore) {
115		if ($node->parent() instanceof Element) {
116		$this->updateScore($node->parent(), $upscore);
117		$this->updateNodeCount($node->parent(), 1);
118
119		if ($node->parent()->parent() instanceof Element) {
120		$this->updateScore($node->parent()->parent(), $upscore / 2);
121		$this->updateNodeCount($node->parent()->parent(), 1);
122		}
123		}
124		}
125
126		/**
127		* @param Element $node
128		* @param array $nodeCandidates
129		*/
130		private function updateBestNodeCandidates(Element $node, $nodeCandidates) {
131	View Code Duplication	if (!in_array($node->parent(), $nodeCandidates, true)) {
		0 ignored issues – show Duplication introduced 2017-11-15 10:13 UTC by Report Bug Copy Issue Report This code seems to be duplicated across your project. Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation. You can also find more detailed suggestions in the “Code” section of your repository. Loading history...
132		if ($node->parent() instanceof Element) {
133		$nodeCandidates[] = $node->parent();
134		}
135		}
136
137		if ($node->parent() instanceof Element) {
138	View Code Duplication	if (!in_array($node->parent()->parent(), $nodeCandidates, true)) {
		0 ignored issues – show Duplication introduced 2017-11-15 10:13 UTC by Report Bug Copy Issue Report This code seems to be duplicated across your project. Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation. You can also find more detailed suggestions in the “Code” section of your repository. Loading history...
139		if ($node->parent()->parent() instanceof Element) {
140		$nodeCandidates[] = $node->parent()->parent();
141		}
142		}
143		}
144
145		return $nodeCandidates;
146		}
147
148		/**
149		* We're going to start looking for where the clusters of paragraphs are. We'll score a cluster based on the number of stopwords
150		* and the number of consecutive paragraphs together, which should form the cluster of text that this node is around
151		* also store on how high up the paragraphs are, comments are usually at the bottom and should get a lower score
152		*
153		* @return Element|null
154		*/
155		public function getTopNode() {
156		$nodes = $this->getTopNodeCandidatesByContents($this->article());
157
158		$nodeCandidates = [];
159
160		$i = 0;
161		foreach ($nodes as $node) {
162		if ($this->isOkToBoost($node)) {
163		$upscore = $this->getTopNodeCandidateScore($node, $i, count($nodes));
164
165		$this->calculateBestNodeCandidateScores($node, $upscore);
166		$nodeCandidates = $this->updateBestNodeCandidates($node, $nodeCandidates);
167
168		$i++;
169		}
170		}
171
172		return $this->getTopNodeByScore($nodeCandidates);
173		}
174
175		/**
176		* A lot of times the first paragraph might be the caption under an image so we'll want to make sure if we're going to
177		* boost a parent node that it should be connected to other paragraphs, at least for the first n paragraphs
178		* so we'll want to make sure that the next sibling is a paragraph and has at least some substantial weight to it
179		*
180		* @param Element $node
181		*
182		* @return bool
183		*/
184		private function isOkToBoost(Element $node) {
185		$stepsAway = 0;
186		$minimumStopWordCount = 5;
187		$maxStepsAwayFromNode = 3;
188
189		// Find all previous sibling element nodes
190		$siblings = $node->precedingAll(function($node) {
191		return $node instanceof Element;
192		});
193
194		foreach ($siblings as $sibling) {
195		if ($sibling->is('p, strong')) {
196		if ($stepsAway >= $maxStepsAwayFromNode) {
197		return false;
198		}
199
200		$wordStats = $this->config()->getStopWords()->getStopwordCount($sibling->text());
201
202		if ($wordStats->getStopWordCount() > $minimumStopWordCount) {
203		return true;
204		}
205
206		$stepsAway += 1;
207		}
208		}
209
210		return false;
211		}
212		}
213

scotteh / php-goose

Push — master ( 9c6482...84b1b9 )

ContentExtractor::updateBestNodeCandidates() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like