Total Complexity | 62 |
Total Lines | 375 |
Duplicated Lines | 0 % |
Changes | 5 | ||
Bugs | 1 | Features | 0 |
Complex classes like OutputFormatter often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use OutputFormatter, and based on these observations, apply Extract Interface, too.
1 | <?php declare(strict_types=1); |
||
17 | class OutputFormatter extends AbstractModule implements ModuleInterface { |
||
18 | use ArticleMutatorTrait, NodeGravityTrait, NodeCommonTrait; |
||
19 | |||
20 | /** @var float Factor applied to the baseline sibling score in getSiblingContent(); a nested paragraph is kept only when its stop-word count exceeds baseline * this factor */ |
||
21 | protected static $SIBLING_BASE_LINE_SCORE = 0.30; |
||
22 | |||
23 | /** @var string Selector tested against each top-node child in postExtractionCleanup(); only children that match it (i.e. are not p, strong or h1-h6) are considered for removal */ |
||
24 | protected static $CLEANUP_IGNORE_SELECTOR = ':not(p):not(strong):not(h1):not(h2):not(h3):not(h4):not(h5):not(h6)'; |
||
25 | |||
/**
 * Cleans up the article's top node and stores its plain-text and HTML renditions on the article.
 *
 * Nothing is written to the article when its top node is not an Element.
 *
 * @inheritdoc
 */
public function run(Article $article) {
    $this->article($article);

    // Guard clause: without an Element top node there is nothing to format.
    if (!($this->article()->getTopNode() instanceof Element)) {
        return $this;
    }

    $this->postExtractionCleanup();

    // Order matters: getFormattedText() rewrites the top node in place before
    // cleanupHtml() serialises that same node.
    $plainText = $this->getFormattedText();
    $article->setCleanedArticleText($plainText);

    $markup = $this->cleanupHtml();
    $article->setHtmlArticle($markup);

    return $this;
}
||
39 | |||
/**
 * Runs the text clean-up passes over the article's top node and flattens the result to plain text.
 *
 * The passes, in order: drop low-scoring nodes, turn links into text, turn inline
 * formatting and headings into text, drop very short paragraphs. The top node is
 * modified in place.
 *
 * @return string Formatted string with all HTML removed
 */
private function getFormattedText(): string {
    // NOTE(review): assumes getTopNode() hands back the same node on every call — confirm.
    $topNode = $this->article()->getTopNode();

    // Every pass returns $this, so they can be chained while keeping their original order.
    $this->removeNodesWithNegativeScores($topNode)
        ->convertLinksToText($topNode)
        ->replaceTagsWithText($topNode)
        ->removeParagraphsWithFewWords($topNode);

    return $this->convertToText($topNode);
}
||
53 | |||
/**
 * Joins the trimmed text of every node returned by $topNode->contents() with blank lines.
 *
 * @param Element $topNode The top most node to format
 *
 * @return string The pieces separated by "\n\n"; an empty string when contents() yields nothing
 */
private function convertToText(Element $topNode): string {
    // No empty() guard here: the Element type declaration already rejects null,
    // and an object is never considered empty(), so that branch could not run.
    $list = [];

    foreach ($topNode->contents() as $child) {
        $list[] = trim($child->text());
    }

    return implode("\n\n", $list);
}
||
73 | |||
/**
 * Serialises the article's top node to HTML after removing short paragraphs,
 * then strips empty paragraph tags from the markup.
 *
 * @return string Formatted string with all HTML, or an empty string when there is no top node
 */
private function cleanupHtml(): string {
    $root = $this->article()->getTopNode();

    if (!empty($root)) {
        $this->removeParagraphsWithFewWords($root);

        $markup = $this->convertToHtml($root);

        // Empty paragraph tags are stripped from the serialised markup.
        return str_replace(['<p></p>', '<p> </p>'], '', $markup);
    }

    return '';
}
||
92 | |||
/**
 * Serialises the given node to an HTML string through its owner document.
 *
 * @param Element $topNode
 *
 * @return string The node's HTML, or an empty string when serialisation fails
 */
private function convertToHtml(Element $topNode): string {
    // No empty() guard: a typed Element argument is never null and never empty().
    $html = $topNode->ownerDocument->saveHTML($topNode);

    // saveHTML() signals failure with false, which would otherwise violate the
    // declared string return type under strict_types.
    return is_string($html) ? $html : '';
}
||
105 | |||
/**
 * Replaces every link under the top node that contains no image with a text
 * node holding the link's normalised text.
 *
 * Links that wrap at least one <img> are left untouched.
 *
 * @param Element $topNode
 *
 * @return self
 */
private function convertLinksToText(Element $topNode): self {
    // No empty() guard: a typed Element argument is never null and never empty().
    foreach ($topNode->find('a') as $item) {
        if ($item->find('img')->count() === 0) {
            $item->substituteWith(new Text(Helper::textNormalise($item->text())));
        }
    }

    return $this;
}
||
128 | |||
/**
 * Removes every element matched by '*[gravityScore]' under the top node whose
 * gravityScore attribute, cast to int, is below 1.
 *
 * Despite the method name, nodes scoring exactly zero are removed as well.
 *
 * @param Element $topNode
 *
 * @return self
 */
private function removeNodesWithNegativeScores(Element $topNode): self {
    // No empty() guard: a typed Element argument is never null and never empty().
    foreach ($topNode->find('*[gravityScore]') as $item) {
        $score = (int)$item->attr('gravityScore');

        if ($score < 1) {
            $item->destroy();
        }
    }

    return $this;
}
||
152 | |||
/**
 * Replaces inline emphasis tags (<b>, <strong>, <i>) with the text inside them
 * so they cause no formatting issues in the plain-text output.
 *
 * Replaces header tags h1 ... h6 with newline padded text.
 *
 * @param Element $topNode
 *
 * @return self
 */
private function replaceTagsWithText(Element $topNode): self {
    // No empty() guard: a typed Element argument is never null and never empty().

    // Inline tags go first, so emphasis nested in a heading is already plain
    // text by the time the heading itself is replaced.
    foreach ($topNode->find('b, strong, i') as $item) {
        $item->substituteWith(new Text($this->getTagCleanedText($item)));
    }

    foreach ($topNode->find('h1, h2, h3, h4, h5, h6') as $header) {
        $header->substituteWith(new Text("\n\n" . $this->getTagCleanedText($header) . "\n\n"));
    }

    return $this;
}
||
180 | |||
181 | /** |
||
182 | * @todo Implement |
||
183 | * |
||
184 | * @param Element $item |
||
185 | * |
||
186 | * @return string |
||
187 | */ |
||
188 | private function getTagCleanedText(Element $item): string { |
||
190 | } |
||
191 | |||
/**
 * Removes paragraphs that carry almost no text, which would indicate that they are
 * some sort of link rather than content.
 *
 * A <p> is removed only when all of the following hold: its normalised text is
 * shorter than 8 characters, it has fewer than 3 stop words, and it contains
 * neither an <object> nor an <embed>.
 *
 * @param Element $topNode
 *
 * @return self
 */
private function removeParagraphsWithFewWords(Element $topNode): self {
    // No empty() guard: a typed Element argument is never null and never empty().
    foreach ($topNode->find('p') as $node) {
        // Read the text once; it feeds both the stop-word count and the length check.
        $text = $node->text();
        $stopWords = $this->config()->getStopWords()->getStopwordCount($text);

        if (mb_strlen(Helper::textNormalise($text)) < 8
            && $stopWords->getStopWordCount() < 3
            && $node->find('object')->count() === 0
            && $node->find('embed')->count() === 0) {
            $node->destroy();
        }
    }

    /** @todo Implement */

    return $this;
}
||
216 | |||
/**
 * Pulls qualifying sibling content into the top node, then removes top-node children
 * that look like non-content: clusters of links, table-like nodes without paragraphs,
 * or nodes scoring too low relative to the top node.
 *
 * Children that are p, strong or h1-h6 are never removed here.
 *
 * @return self
 */
private function postExtractionCleanup(): self {
    // NOTE(review): assumes getTopNode() hands back the same node on every call — confirm.
    $topNode = $this->article()->getTopNode();

    $this->addSiblings($topNode);

    foreach ($topNode->contents() as $child) {
        if (!$child->is(self::$CLEANUP_IGNORE_SELECTOR)) {
            continue;
        }

        // Evaluation order and short-circuiting are kept as-is:
        // isTableTagAndNoParagraphsExist() modifies the node it inspects.
        $looksLikeNoise = $this->isHighLinkDensity($child)
            || $this->isTableTagAndNoParagraphsExist($child)
            || !$this->isNodeScoreThreshholdMet($topNode, $child);

        if ($looksLikeNoise) {
            $child->destroy();
        }
    }

    return $this;
}
||
237 | |||
/**
 * Removes every <p> and <strong> under the given node whose normalised text is
 * shorter than 25 characters.
 *
 * @param Element $topNode
 *
 * @return self
 */
private function removeSmallParagraphs(Element $topNode): self {
    foreach ($topNode->find('p, strong') as $candidate) {
        $length = mb_strlen(Helper::textNormalise($candidate->text()));

        if ($length >= 25) {
            continue;
        }

        $candidate->destroy();
    }

    return $this;
}
||
254 | |||
/**
 * Reports whether the node is left without any paragraph once its short
 * paragraphs have been removed.
 *
 * Side effect: removeSmallParagraphs() is run on the node first. A <td> never
 * qualifies. A <ul>/<ol> whose link text makes up less than half of its total
 * text is spared.
 *
 * @param Element $topNode
 *
 * @return bool True when the node has no paragraphs left and is not spared by the rules above
 */
private function isTableTagAndNoParagraphsExist(Element $topNode): bool {
    $this->removeSmallParagraphs($topNode);

    // Still has paragraphs, or is a table cell: not a removal candidate.
    if ($topNode->find('p')->count() > 0 || !$topNode->is(':not(td)')) {
        return false;
    }

    if (!$topNode->is('ul, ol')) {
        return true;
    }

    // Lists: compare how much of the text sits inside links.
    $linkChars = 0;

    foreach ($topNode->find('a') as $anchor) {
        $linkChars += mb_strlen(Helper::textNormalise($anchor->text()));
    }

    $totalChars = mb_strlen(Helper::textNormalise($topNode->text()));

    $mostlyPlainText = $totalChars > 0 && ($linkChars / $totalChars) < 0.5;

    return !$mostlyPlainText;
}
||
283 | |||
/**
 * Checks whether a node scores at least 8% of the top node's score.
 *
 * A <td> always passes, whatever its score.
 *
 * @param Element $topNode Node whose score sets the threshold
 * @param Element $node    Node being checked
 *
 * @return bool False only when the node is not a <td> and scores below the threshold
 */
private function isNodeScoreThreshholdMet(Element $topNode, Element $node): bool {
    $referenceScore = $this->getScore($topNode);
    $nodeScore = $this->getScore($node);
    $minimumScore = $referenceScore * 0.08;

    $belowThreshold = $nodeScore < $minimumScore;

    return !($belowThreshold && $node->is(':not(td)'));
}
||
301 | |||
/**
 * Collects the content of a sibling node that is worth adding to the top node.
 *
 * A non-empty <p> or <strong> sibling is returned as-is. Otherwise each non-empty
 * <p>/<strong> inside the sibling is copied into a new <p> element when its
 * stop-word count exceeds the baseline score multiplied by SIBLING_BASE_LINE_SCORE.
 *
 * @param Element $currentSibling
 * @param float $baselineScoreForSiblingParagraphs
 *
 * @return Element[]
 */
private function getSiblingContent(Element $currentSibling, float $baselineScoreForSiblingParagraphs): array {
    $text = trim($currentSibling->text());

    if ($currentSibling->is('p, strong') && !empty($text)) {
        return [$currentSibling];
    }

    $results = [];

    // Loop-invariant: the stop-word count a paragraph has to exceed.
    $requiredStopWords = $baselineScoreForSiblingParagraphs * self::$SIBLING_BASE_LINE_SCORE;

    foreach ($currentSibling->find('p, strong') as $node) {
        $text = trim($node->text());

        if (empty($text)) {
            continue;
        }

        $wordStats = $this->config()->getStopWords()->getStopwordCount($text);

        if ($requiredStopWords < $wordStats->getStopWordCount()) {
            // createElement() does not escape its value argument, so a bare "&"
            // in the text would raise a warning and truncate the content.
            // A text node carries the string literally.
            $document = $node->document();
            $paragraph = $document->createElement('p');
            $paragraph->appendChild($document->createTextNode($text));

            $results[] = $paragraph;
        }
    }

    return $results;
}
||
335 | |||
/**
 * Prepends qualifying content from the top node's preceding element siblings
 * to the top node.
 *
 * @param Element $topNode
 *
 * @return self
 */
private function addSiblings(Element $topNode): self {
    $baseline = $this->getBaselineScoreForSiblings($topNode);

    // Only element nodes among the preceding siblings are considered.
    $onlyElements = function($candidate) {
        return $candidate instanceof Element;
    };

    foreach ($topNode->precedingAll($onlyElements) as $sibling) {
        foreach ($this->getSiblingContent($sibling, $baseline) as $paragraph) {
            // Always inserted before the current first child, so each new
            // paragraph lands ahead of the ones inserted before it.
            $topNode->insertBefore($paragraph, $topNode->firstChild);
        }
    }

    return $this;
}
||
359 | |||
360 | /** |
||
361 | * we could have long articles that have tons of paragraphs so if we tried to calculate the base score against |
||
362 | * the total text score of those paragraphs it would be unfair. So we need to normalize the score based on the average scoring |
||
363 | * of the paragraphs within the top node. For example if our total score of 10 paragraphs was 1000 but each had an average value of |
||
364 | * 100 then 100 should be our base. |
||
365 | * |
||
366 | * @param Element $topNode |
||
367 | * |
||
368 | * @return float |
||
369 | */ |
||
370 | private function getBaselineScoreForSiblings(Element $topNode): float { |
||
392 | } |
||
393 | } |
||
394 |