Completed
Push — master ( 9c6482...84b1b9 )
by Andrew
01:51
created

ContentExtractor::updateBestNodeCandidates()   B

Complexity

Conditions 6
Paths 12

Size

Total Lines 17
Code Lines 9

Duplication

Lines 10
Ratio 58.82 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
dl 10
loc 17
rs 8.8571
c 1
b 0
f 0
cc 6
eloc 9
nc 12
nop 2
1
<?php
2
3
namespace Goose\Modules\Extractors;
4
5
use Goose\Article;
6
use Goose\Traits\NodeCommonTrait;
7
use Goose\Traits\NodeGravityTrait;
8
use Goose\Traits\ArticleMutatorTrait;
9
use Goose\Modules\AbstractModule;
10
use Goose\Modules\ModuleInterface;
11
use DOMWrap\Element;
12
13
/**
14
 * Content Extractor
15
 *
16
 * @package Goose\Modules\Extractors
17
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
18
 */
19
class ContentExtractor extends AbstractModule implements ModuleInterface {
20
    use ArticleMutatorTrait, NodeGravityTrait, NodeCommonTrait;
21
22
    /**
     * Module entry point: determine the article's top content node and store it on the article.
     *
     * @param Article $article Article to process; its top node is set to whatever getTopNode() returns
     */
    public function run(Article $article) {
        // Register the article first: getTopNode() fetches it back through $this->article().
        $this->article($article);

        $topNode = $this->getTopNode();
        $article->setTopNode($topNode);
    }
30
31
    /**
     * Collect the text-bearing nodes that are worth scoring.
     *
     * Every node matched by the selector 'p, td, pre' is inspected. A node is kept
     * when its text contains more than two stop words and it is not reported as
     * having a high link density.
     *
     * @param Article $article
     *
     * @return array Matching nodes, in the order the document query yielded them
     */
    private function getTopNodeCandidatesByContents(Article $article) {
        $candidates = [];

        foreach ($article->getDoc()->find('p, td, pre') as $element) {
            $stats = $this->config()->getStopWords()->getStopwordCount($element->text());
            $linkHeavy = $this->isHighLinkDensity($element);

            // Skip nodes with too little prose or too many links.
            if ($stats->getStopWordCount() <= 2 || $linkHeavy) {
                continue;
            }

            $candidates[] = $element;
        }

        return $candidates;
    }
52
53
    /**
     * Score a candidate node from its stop-word count plus a position-based boost.
     *
     * The default boost is 50 / ($i + 1), so earlier positions are boosted more.
     * When there are more than 15 nodes and this one falls within the last quarter
     * of them, the boost is replaced by a penalty: the negated square of how far
     * into that last quarter the node sits. If that penalty's magnitude exceeds 40,
     * a flat boost of 5 is used instead.
     *
     * @param Element $node Node whose text is scanned for stop words
     * @param int $i Zero-based position supplied by the caller
     * @param int $totalNodes Total number of nodes under consideration
     *
     * @return double Stop-word count plus the boost (or penalty)
     */
    private function getTopNodeCandidateScore(Element $node, $i, $totalNodes) {
        $boostScore = (1.0 / ($i + 1)) * 50;

        $tailSize = $totalNodes * 0.25;
        $distanceFromEnd = $totalNodes - $i;

        if ($totalNodes > 15 && $distanceFromEnd <= $tailSize) {
            $penalty = pow($tailSize - $distanceFromEnd, 2);

            // Penalties larger than 40 are not applied; a small fixed boost is used instead.
            $boostScore = ($penalty > 40) ? 5 : -$penalty;
        }

        $stats = $this->config()->getStopWords()->getStopwordCount($node->text());

        return $stats->getStopWordCount() + $boostScore;
    }
80
81
    /**
     * Pick the highest-scoring candidate, provided its score is strong enough.
     *
     * Only scores strictly greater than zero can win, and on a tie the earlier
     * node is kept. The winner is discarded when its score is below 20.
     *
     * @param array $nodes Candidate elements to compare
     *
     * @return Element|null The best candidate, or null when there are no candidates,
     *                      none scores above zero, or the best score is below 20
     */
    private function getTopNodeByScore($nodes) {
        $topNode = null;
        $topNodeScore = 0;

        foreach ($nodes as $node) {
            $score = $this->getScore($node);

            if ($score > $topNodeScore) {
                $topNode = $node;
                $topNodeScore = $score;
            }
        }

        // $topNode is only ever null or a node, so no separate "false" fallback is needed.
        // $topNodeScore already holds the winner's score; there is no need to look it up again.
        if ($topNode === null || $topNodeScore < 20) {
            return null;
        }

        return $topNode;
    }
109
110
    /**
     * Propagate a node's score to its two nearest ancestors.
     *
     * The parent receives the full score and the grandparent half of it; each also
     * has its node count increased by one. Nothing happens when the parent is not
     * an Element, and the grandparent is skipped when it is not an Element.
     *
     * @param Element $node Node whose ancestors are credited
     * @param double $upscore Score to distribute
     */
    private function calculateBestNodeCandidateScores(Element $node, $upscore) {
        $parent = $node->parent();

        if (!($parent instanceof Element)) {
            return;
        }

        $this->updateScore($parent, $upscore);
        $this->updateNodeCount($parent, 1);

        $grandparent = $parent->parent();

        if ($grandparent instanceof Element) {
            $this->updateScore($grandparent, $upscore / 2);
            $this->updateNodeCount($grandparent, 1);
        }
    }
125
126
    /**
     * Add a node's parent and grandparent to the candidate list if they are not in it yet.
     *
     * Only ancestors that are Element instances are added, and the grandparent is
     * considered only when the parent is an Element. Membership is tested by
     * identity (strict in_array), so each element appears at most once.
     *
     * @param Element $node Node whose ancestors should be tracked
     * @param array $nodeCandidates Candidates collected so far
     *
     * @return array The candidate list, including any newly added ancestors
     */
    private function updateBestNodeCandidates(Element $node, $nodeCandidates) {
        $parent = $node->parent();

        if (!($parent instanceof Element)) {
            return $nodeCandidates;
        }

        // Parent and grandparent follow the same rule, so handle both in one pass.
        foreach ([$parent, $parent->parent()] as $ancestor) {
            if ($ancestor instanceof Element && !in_array($ancestor, $nodeCandidates, true)) {
                $nodeCandidates[] = $ancestor;
            }
        }

        return $nodeCandidates;
    }
147
148
    /**
     * Find the element most likely to contain the article's main text.
     *
     * We look for where the clusters of paragraphs are. Each paragraph that passes
     * isOkToBoost() is scored from its stop-word count and its position (content
     * near the bottom, where comments usually live, scores lower). That score is
     * credited to the paragraph's parent and grandparent, which become the
     * candidates from which the best-scoring element is chosen.
     *
     * @return Element|null
     */
    public function getTopNode() {
        $paragraphs = $this->getTopNodeCandidatesByContents($this->article());
        $paragraphCount = count($paragraphs);

        $candidates = [];
        $boosted = 0;

        foreach ($paragraphs as $paragraph) {
            if (!$this->isOkToBoost($paragraph)) {
                continue;
            }

            $score = $this->getTopNodeCandidateScore($paragraph, $boosted, $paragraphCount);

            $this->calculateBestNodeCandidateScores($paragraph, $score);
            $candidates = $this->updateBestNodeCandidates($paragraph, $candidates);

            // The position counter only advances for paragraphs that were actually scored.
            $boosted++;
        }

        return $this->getTopNodeByScore($candidates);
    }
174
175
    /**
     * Decide whether a node is connected to enough surrounding text to be boosted.
     *
     * A lot of times the first paragraph might be the caption under an image, so
     * before boosting a parent node we make sure the paragraph is connected to
     * other paragraphs. The node's preceding sibling elements are walked; among
     * those matching 'p, strong', the first one with more than 5 stop words makes
     * the node boostable. The search gives up once 3 such siblings have been
     * examined without success.
     *
     * @param Element $node
     *
     * @return bool
     */
    private function isOkToBoost(Element $node) {
        $requiredStopWords = 5;
        $siblingLimit = 3;
        $siblingsChecked = 0;

        // Restrict the walk to previous siblings that are elements.
        $previousElements = $node->precedingAll(function($candidate) {
            return $candidate instanceof Element;
        });

        foreach ($previousElements as $previous) {
            if (!$previous->is('p, strong')) {
                continue;
            }

            if ($siblingsChecked >= $siblingLimit) {
                return false;
            }

            $stats = $this->config()->getStopWords()->getStopwordCount($previous->text());

            if ($stats->getStopWordCount() > $requiredStopWords) {
                return true;
            }

            $siblingsChecked++;
        }

        return false;
    }
212
}
213