ContentExtractor::run()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 6
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 3
dl 0
loc 6
c 0
b 0
f 0
rs 10
cc 1
nc 1
nop 1
1
<?php declare(strict_types=1);
2
3
namespace Goose\Modules\Extractors;
4
5
use Goose\Article;
6
use Goose\Traits\{NodeCommonTrait, NodeGravityTrait, ArticleMutatorTrait};
7
use Goose\Modules\{AbstractModule, ModuleInterface};
8
use DOMWrap\Element;
9
10
/**
 * Content Extractor
 *
 * Module that picks the DOM element most likely to contain the main article
 * text. run() stores the result on the article via Article::setTopNode().
 *
 * @package Goose\Modules\Extractors
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
class ContentExtractor extends AbstractModule implements ModuleInterface {
    use ArticleMutatorTrait, NodeGravityTrait, NodeCommonTrait;

    /** @inheritdoc  */
    public function run(Article $article): self {
        $this->article($article);

        $article->setTopNode($this->getTopNode());

        return $this;
    }

    /**
     * Collect the `p`, `td` and `pre` elements of the document that contain
     * more than two stop words and are not flagged as link-heavy.
     *
     * @param Article $article
     *
     * @return array Matching elements, in the order returned by find()
     */
    private function getTopNodeCandidatesByContents(Article $article): array {
        $results = [];

        foreach ($article->getDoc()->find('p, td, pre') as $node) {
            $wordStats = $this->config()->getStopWords()->getStopwordCount($node->text());
            $highLinkDensity = $this->isHighLinkDensity($node);

            if (!$highLinkDensity && $wordStats->getStopWordCount() > 2) {
                $results[] = $node;
            }
        }

        return $results;
    }

    /**
     * Compute the score a text node contributes to its ancestors.
     *
     * The score is the node's stop-word count plus a positional boost of
     * 50 / ($i + 1), so earlier nodes contribute more. When there are more
     * than 15 nodes, a node lying within the last quarter of the list gets a
     * penalty instead: minus the square of how far into that quarter it is.
     * A penalty whose magnitude exceeds 40 is replaced by a flat +5.
     *
     * @param Element $node Node being scored
     * @param int $i Zero-based position of the node among the boosted nodes
     * @param int $totalNodes Total number of candidate nodes
     *
     * @return float
     */
    private function getTopNodeCandidateScore(Element $node, int $i, int $totalNodes): float {
        $boostScore = (1.0 / ($i + 1)) * 50;
        $bottomNodesForNegativeScore = $totalNodes * 0.25;
        $distanceFromEnd = $totalNodes - $i;

        if ($totalNodes > 15 && $distanceFromEnd <= $bottomNodesForNegativeScore) {
            $booster = $bottomNodesForNegativeScore - $distanceFromEnd;
            $boostScore = pow($booster, 2) * -1;

            // Cap runaway penalties: anything beyond -40 becomes a small positive boost.
            if (abs($boostScore) > 40) {
                $boostScore = 5;
            }
        }

        $wordStats = $this->config()->getStopWords()->getStopwordCount($node->text());

        return $wordStats->getStopWordCount() + $boostScore;
    }

    /**
     * Return the highest-scoring node, or null when no node has a positive
     * score or the best score is below 20.
     *
     * @param array $nodes Candidate elements
     *
     * @return Element|null
     */
    private function getTopNodeByScore(array $nodes): ?Element {
        $topNode = null;
        $topNodeScore = 0;

        foreach ($nodes as $node) {
            $score = $this->getScore($node);

            // Strict comparison keeps the first node on ties and ignores scores <= 0.
            if ($score > $topNodeScore) {
                $topNode = $node;
                $topNodeScore = $score;
            }
        }

        if ($topNode === null) {
            return null;
        }

        // A winner scoring under 20 is not considered a credible article body.
        if ($this->getScore($topNode) < 20) {
            return null;
        }

        return $topNode;
    }

    /**
     * Propagate a node's score upwards: the parent receives the full score,
     * the grandparent half of it, and each has its node count raised by one.
     * Ancestors that are not Element instances are skipped.
     *
     * @param Element $node
     * @param float $upscore Score computed for $node
     *
     * @return self
     */
    private function calculateBestNodeCandidateScores(Element $node, float $upscore): self {
        $parent = $node->parent();

        if (!$parent instanceof Element) {
            return $this;
        }

        $this->updateScore($parent, $upscore);
        $this->updateNodeCount($parent, 1);

        $grandparent = $parent->parent();

        if ($grandparent instanceof Element) {
            $this->updateScore($grandparent, $upscore / 2);
            $this->updateNodeCount($grandparent, 1);
        }

        return $this;
    }

    /**
     * Add the node's parent and grandparent to the candidate list when they
     * are Element instances and not already present (identity comparison).
     *
     * @param Element $node
     * @param array $nodeCandidates Candidates collected so far
     *
     * @return array The candidate list, possibly extended
     */
    private function updateBestNodeCandidates(Element $node, array $nodeCandidates): array {
        $parent = $node->parent();

        // Without an Element parent there is no grandparent to consider either.
        if (!$parent instanceof Element) {
            return $nodeCandidates;
        }

        foreach ([$parent, $parent->parent()] as $ancestor) {
            if ($ancestor instanceof Element && !in_array($ancestor, $nodeCandidates, true)) {
                $nodeCandidates[] = $ancestor;
            }
        }

        return $nodeCandidates;
    }

    /**
     * Look for where the clusters of paragraphs are. Each text node passes a
     * score, based on its stop-word count and its position, up to its parent
     * and grandparent, so ancestors that gather many text nodes accumulate
     * the highest totals. Position matters because comments usually sit at
     * the bottom of a page and should score lower.
     *
     * @return Element|null The best-scoring ancestor, or null if none qualifies
     */
    public function getTopNode(): ?Element {
        $nodes = $this->getTopNodeCandidatesByContents($this->article());
        $totalNodes = count($nodes);

        $nodeCandidates = [];

        // Position counter: only advances for nodes that pass isOkToBoost().
        $i = 0;

        foreach ($nodes as $node) {
            if (!$this->isOkToBoost($node)) {
                continue;
            }

            $upscore = $this->getTopNodeCandidateScore($node, $i, $totalNodes);

            $this->calculateBestNodeCandidateScores($node, $upscore);
            $nodeCandidates = $this->updateBestNodeCandidates($node, $nodeCandidates);

            $i++;
        }

        return $this->getTopNodeByScore($nodeCandidates);
    }

    /**
     * A lot of times the first paragraph might be the caption under an image,
     * so a node is only allowed to boost its ancestors when it is connected
     * to other substantial text. The preceding sibling elements are walked;
     * the node is accepted as soon as a `p` or `strong` sibling with more than
     * five stop words is found, and rejected once three such siblings have
     * been examined without success or the siblings run out.
     *
     * @param Element $node
     *
     * @return bool
     */
    private function isOkToBoost(Element $node): bool {
        $stepsAway = 0;
        $minimumStopWordCount = 5;
        $maxStepsAwayFromNode = 3;

        // Find all previous sibling element nodes
        $siblings = $node->precedingAll(static function($candidate) {
            return $candidate instanceof Element;
        });

        foreach ($siblings as $sibling) {
            // Only paragraph-like siblings count as steps.
            if (!$sibling->is('p, strong')) {
                continue;
            }

            if ($stepsAway >= $maxStepsAwayFromNode) {
                return false;
            }

            $wordStats = $this->config()->getStopWords()->getStopwordCount($sibling->text());

            if ($wordStats->getStopWordCount() > $minimumStopWordCount) {
                return true;
            }

            $stepsAway += 1;
        }

        return false;
    }
}
216