1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Goose\Modules\Extractors; |
4
|
|
|
|
5
|
|
|
use Goose\Article; |
6
|
|
|
use Goose\Traits\NodeCommonTrait; |
7
|
|
|
use Goose\Traits\NodeGravityTrait; |
8
|
|
|
use Goose\Traits\ArticleMutatorTrait; |
9
|
|
|
use Goose\Modules\AbstractModule; |
10
|
|
|
use Goose\Modules\ModuleInterface; |
11
|
|
|
use DOMWrap\Element; |
12
|
|
|
|
13
|
|
|
/**
 * Content Extractor
 *
 * Scores candidate text nodes of an article document and records the
 * best-scoring ancestor element as the article's top node.
 *
 * @package Goose\Modules\Extractors
 * @license http://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 */
class ContentExtractor extends AbstractModule implements ModuleInterface {
    use ArticleMutatorTrait, NodeGravityTrait, NodeCommonTrait;

    /**
     * Registers the article on this module, then stores the result of
     * getTopNode() on it.
     *
     * @param Article $article
     */
    public function run(Article $article) {
        $this->article($article);

        $article->setTopNode($this->getTopNode());
    }

    /**
     * Collects every `p`, `td` and `pre` node whose text contains more than
     * two stop words and which is not flagged as having a high link density.
     *
     * @param Article $article
     *
     * @return array
     */
    private function getTopNodeCandidatesByContents(Article $article) {
        $results = [];

        $nodes = $article->getDoc()->find('p, td, pre');

        foreach ($nodes as $node) {
            $wordStats = $this->config()->getStopWords()->getStopwordCount($node->text());
            $highLinkDensity = $this->isHighLinkDensity($node);

            if ($wordStats->getStopWordCount() > 2 && !$highLinkDensity) {
                $results[] = $node;
            }
        }

        return $results;
    }

    /**
     * Computes the score a candidate contributes to its ancestors: its stop
     * word count plus a position-dependent boost.
     *
     * @param Element $node
     * @param int $i Zero-based position of the node among the boosted candidates
     * @param int $totalNodes Total number of candidate nodes
     *
     * @return double
     */
    private function getTopNodeCandidateScore(Element $node, $i, $totalNodes) {
        // Earlier candidates receive the larger boost: 50, 25, 16.67, ...
        $boostScore = (1.0 / ($i + 1)) * 50;

        if ($totalNodes > 15) {
            // Only applied to larger documents: candidates that fall within
            // the last quarter of the list are penalised instead of boosted.
            $bottomNodesForNegativeScore = $totalNodes * 0.25;
            $distanceFromEnd = $totalNodes - $i;

            if ($distanceFromEnd <= $bottomNodesForNegativeScore) {
                $booster = $bottomNodesForNegativeScore - $distanceFromEnd;
                $boostScore = pow($booster, 2) * -1;

                // A penalty whose magnitude exceeds 40 is replaced by a small
                // positive boost.
                // NOTE(review): looks like a cap on the penalty; confirm intent.
                if (abs($boostScore) > 40) {
                    $boostScore = 5;
                }
            }
        }

        $wordStats = $this->config()->getStopWords()->getStopwordCount($node->text());

        return $wordStats->getStopWordCount() + $boostScore;
    }

    /**
     * Picks the node with the highest positive score, provided that score
     * is at least 20.
     *
     * @param array $nodes
     *
     * @return Element|null The best node, or null when there is none or its score is below 20
     */
    private function getTopNodeByScore($nodes) {
        $topNode = null;
        $topNodeScore = 0;

        foreach ($nodes as $node) {
            $score = $this->getScore($node);

            // Strict comparison: on ties the first node encountered wins, and
            // nodes scoring zero or less are never selected.
            if ($score > $topNodeScore) {
                $topNode = $node;
                $topNodeScore = $score;
            }
        }

        if ($topNode === null || $topNodeScore < 20) {
            return null;
        }

        return $topNode;
    }

    /**
     * Adds the full score to the node's parent and half of it to the
     * grandparent, incrementing the node count of each by one.
     *
     * @param Element $node
     * @param double $upscore
     */
    private function calculateBestNodeCandidateScores(Element $node, $upscore) {
        $parent = $node->parent();

        if (!$parent instanceof Element) {
            return;
        }

        $this->updateScore($parent, $upscore);
        $this->updateNodeCount($parent, 1);

        $grandparent = $parent->parent();

        if ($grandparent instanceof Element) {
            $this->updateScore($grandparent, $upscore / 2);
            $this->updateNodeCount($grandparent, 1);
        }
    }

    /**
     * Appends the node's parent and grandparent to the candidate list when
     * they are elements and not already present.
     *
     * @param Element $node
     * @param array $nodeCandidates
     *
     * @return array The candidate list including any newly added ancestors
     */
    private function updateBestNodeCandidates(Element $node, $nodeCandidates) {
        $parent = $node->parent();

        // Without an element parent there is no grandparent to consider either.
        if (!$parent instanceof Element) {
            return $nodeCandidates;
        }

        // Strict in_array() compares objects by identity.
        if (!in_array($parent, $nodeCandidates, true)) {
            $nodeCandidates[] = $parent;
        }

        $grandparent = $parent->parent();

        if ($grandparent instanceof Element && !in_array($grandparent, $nodeCandidates, true)) {
            $nodeCandidates[] = $grandparent;
        }

        return $nodeCandidates;
    }

    /**
     * We're going to start looking for where the clusters of paragraphs are. We'll score a cluster based on the
     * number of stopwords and the number of consecutive paragraphs together, which should form the cluster of text
     * that this node is around. We also score on how high up the paragraphs are: comments are usually at the bottom
     * and should get a lower score.
     *
     * @return Element|null
     */
    public function getTopNode() {
        $nodes = $this->getTopNodeCandidatesByContents($this->article());
        $totalNodes = count($nodes);

        $nodeCandidates = [];

        // Position counter; it only advances for candidates that are boosted.
        $i = 0;
        foreach ($nodes as $node) {
            if (!$this->isOkToBoost($node)) {
                continue;
            }

            $upscore = $this->getTopNodeCandidateScore($node, $i, $totalNodes);

            $this->calculateBestNodeCandidateScores($node, $upscore);
            $nodeCandidates = $this->updateBestNodeCandidates($node, $nodeCandidates);

            $i++;
        }

        return $this->getTopNodeByScore($nodeCandidates);
    }

    /**
     * A lot of times the first paragraph might be the caption under an image, so before boosting a parent node we
     * make sure the node is connected to other paragraphs: one of the first few preceding `p`/`strong` siblings
     * must carry more than a minimum number of stop words.
     *
     * @param Element $node
     *
     * @return bool
     */
    private function isOkToBoost(Element $node) {
        $stepsAway = 0;
        $minimumStopWordCount = 5;
        $maxStepsAwayFromNode = 3;

        // Find all previous sibling element nodes
        $siblings = $node->precedingAll(function($candidate) {
            return $candidate instanceof Element;
        });

        foreach ($siblings as $sibling) {
            // Only `p` and `strong` siblings are inspected and counted as steps.
            if (!$sibling->is('p, strong')) {
                continue;
            }

            if ($stepsAway >= $maxStepsAwayFromNode) {
                return false;
            }

            $wordStats = $this->config()->getStopWords()->getStopwordCount($sibling->text());

            if ($wordStats->getStopWordCount() > $minimumStopWordCount) {
                return true;
            }

            $stepsAway += 1;
        }

        return false;
    }
}
213
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.