Total Complexity | 295 |
Total Lines | 1788 |
Duplicated Lines | 0 % |
Changes | 2 | ||
Bugs | 0 | Features | 1 |
Complex classes like Readability often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Readability, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
15 | class Readability |
||
16 | { |
||
/**
 * Main DOMDocument where all the magic happens.
 *
 * Assigned by parse() from loadHTML(), and re-assigned each time parse()
 * reloads the HTML for another extraction attempt.
 *
 * @var DOMDocument
 */
protected $dom;

/**
 * Title of the article.
 *
 * @var string|null
 */
protected $title = null;

/**
 * Final DOMDocument with the fully parsed HTML.
 *
 * @var DOMDocument|null
 */
protected $content = null;

/**
 * Excerpt of the article.
 *
 * @var string|null
 */
protected $excerpt = null;

/**
 * Main image of the article.
 *
 * @var string|null
 */
protected $image = null;

/**
 * Author of the article. Extracted from the byline tags and other social media properties.
 *
 * @var string|null
 */
protected $author = null;

/**
 * Website name.
 *
 * @var string|null
 */
protected $siteName = null;

/**
 * Direction of the text.
 *
 * @var string|null
 */
protected $direction = null;

/**
 * Configuration object.
 *
 * @var Configuration
 */
private $configuration;

/**
 * Logger object. Taken from the configuration in the constructor.
 *
 * @var LoggerInterface
 */
private $logger;

/**
 * Collection of attempted text extractions.
 *
 * parse() appends one entry per attempt that fell below the character threshold,
 * each shaped as ['articleContent' => <result of rateNodes()>, 'textLength' => int].
 *
 * @var array
 */
private $attempts = [];

/**
 * Lowercase tag names that getNodes() adds to the list of elements to score.
 *
 * @var array
 */
private $defaultTagsToScore = [
    'section',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'p',
    'td',
    'pre',
];

/**
 * Tag names exempt from being altered into DIVs.
 *
 * NOTE(review): meaning inferred from the property name only; the code that
 * reads this list is outside the visible part of the file - confirm.
 *
 * @var array
 */
private $alterToDIVExceptions = [
    'div',
    'article',
    'section',
    'p',
];
||
118 | |||
/**
 * Builds a Readability instance around the given configuration.
 *
 * The logger is not injected separately: it is obtained from the
 * configuration object via getLogger().
 *
 * @param Configuration $configuration Settings (and logger provider) used by the parser
 */
public function __construct(Configuration $configuration)
{
    $this->logger = $configuration->getLogger();
    $this->configuration = $configuration;
}
||
129 | |||
/**
 * Main parse function.
 *
 * Loads the HTML, extracts the metadata and the main image and then runs the
 * scoring algorithm. When the extracted text is shorter than the configured
 * character threshold, the HTML is reloaded and the algorithm is run again with
 * one more flag disabled, in this order: StripUnlikelyCandidates, WeightClasses,
 * CleanConditionally. When every flag is already off, the longest attempt found
 * so far is used instead.
 *
 * Note that the flags are switched off on the injected Configuration object
 * itself and are not restored afterwards.
 *
 * @param string $html
 *
 * @throws ParseException When there is no (non-empty) body tag, or when no attempt produced any text
 *
 * @return bool Always true; failures are reported through ParseException
 */
public function parse($html)
{
    $this->logger->info('*** Starting parse process...');

    $this->dom = $this->loadHTML($html);

    // Checking for minimum HTML to work with.
    if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) {
        $this->logger->emergency('No body tag present or body tag empty');

        throw new ParseException('Invalid or incomplete HTML.');
    }

    $this->getMetadata();

    $this->getMainImage();

    while (true) {
        $root = $root->firstChild;

        $elementsToScore = $this->getNodes($root);
        $this->logger->debug(sprintf('Elements to score: \'%s\'', count($elementsToScore)));

        $result = $this->rateNodes($elementsToScore);

        /*
         * Now that we've gone through the full algorithm, check to see if
         * we got any meaningful content. If we didn't, we may need to re-run
         * grabArticle with different flags set. This gives us a higher likelihood of
         * finding the content, and the sieve approach gives us a higher likelihood of
         * finding the -right- content.
         */

        // Only dereference the result when there is one; an empty result counts as zero characters.
        $length = $result
            ? mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent))
            : 0;
        $threshold = $this->configuration->getCharThreshold();

        $this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $threshold));

        if (!$result || $length >= $threshold) {
            break;
        }

        // Threshold not met: remember this attempt and start over from a pristine DOM.
        $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
        $this->dom = $this->loadHTML($html);
        $root = $this->dom->getElementsByTagName('body')->item(0);

        if ($this->configuration->getStripUnlikelyCandidates()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
            $this->configuration->setStripUnlikelyCandidates(false);
        } elseif ($this->configuration->getWeightClasses()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting WeightClasses as false');
            $this->configuration->setWeightClasses(false);
        } elseif ($this->configuration->getCleanConditionally()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting CleanConditionally as false');
            $this->configuration->setCleanConditionally(false);
        } else {
            $this->logger->debug('[Parsing] Threshold not met, searching across attempts for some content.');

            // No luck after removing flags, just return the longest text we found during the different loops.
            // The comparator must return an integer (negative/zero/positive), not a boolean,
            // otherwise the resulting order is not reliable. Longest attempt goes first.
            usort($this->attempts, function ($a, $b) {
                return $b['textLength'] - $a['textLength'];
            });

            // But first check if we actually have something
            if (!$this->attempts[0]['textLength']) {
                $this->logger->emergency('[Parsing] Could not parse text, giving up :(');

                throw new ParseException('Could not parse text.');
            }

            $this->logger->debug('[Parsing] Threshold not met, but found some content in previous attempts.');

            $result = $this->attempts[0]['articleContent'];
            break;
        }
    }

    $result = $this->postProcessContent($result);

    // If we haven't found an excerpt in the article's metadata, use the article's
    // first paragraph as the excerpt. This can be used for displaying a preview of
    // the article's content.
    if (!$this->getExcerpt()) {
        $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
        $paragraphs = $result->getElementsByTagName('p');
        if ($paragraphs->length > 0) {
            $this->setExcerpt(trim($paragraphs->item(0)->textContent));
        }
    }

    $this->setContent($result);

    $this->logger->info('*** Parse successful :)');

    return true;
}
||
237 | |||
/**
 * Creates a DOM Document object and loads the provided HTML on it.
 *
 * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text)
 * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
 * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
 * objects and ruining the backup.
 *
 * Depending on the configuration the HTML is pre-processed before loading
 * (entity normalization, regex based script removal). After loading, script
 * tags are removed and the document is prepared through prepDocument().
 *
 * @param string $html
 *
 * @return DOMDocument
 */
private function loadHTML($html)
{
    $this->logger->debug('[Loading] Loading HTML...');

    // To avoid throwing a gazillion of errors on malformed HTMLs
    libxml_use_internal_errors(true);

    $dom = new DOMDocument('1.0', 'utf-8');

    if (!$this->configuration->getSubstituteEntities()) {
        // Keep the original HTML entities
        $dom->substituteEntities = false;
    }

    if ($this->configuration->getNormalizeEntities()) {
        $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
        // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
        $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
    }

    if ($this->configuration->getSummonCthulhu()) {
        $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
        $html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html);
    }

    // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
    $dom->loadHTML('<?xml encoding="UTF-8">'.$html);
    $dom->encoding = 'UTF-8';

    // The parse errors were only buffered to keep them quiet. Drop them now, otherwise
    // they pile up in libxml's internal buffer on every (re)load of the document.
    libxml_clear_errors();

    $this->removeScripts($dom);

    $this->prepDocument($dom);

    $this->logger->debug('[Loading] Loaded HTML successfully.');

    return $dom;
}
||
287 | |||
/**
 * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties.
 *
 * Every <meta> tag is inspected: its "property" attribute is matched first and,
 * when that yields nothing, its "name" attribute. The collected values are then
 * used to set the title, author, excerpt, image and site name, each one taken
 * from the first key available in a fixed priority list.
 */
private function getMetadata()
{
    $this->logger->debug('[Metadata] Retrieving metadata...');

    // property is a space-separated list of values
    $propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image|site_name)(?!:)\s*/i';

    // name is a single value
    $namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)(?!:)\s*$/i';

    $values = [];

    foreach ($this->dom->getElementsByTagName('meta') as $meta) {
        /* @var DOMNode $meta */
        $elementName = $meta->getAttribute('name');
        $elementProperty = $meta->getAttribute('property');
        $content = $meta->getAttribute('content');
        $matches = null;

        if ($elementProperty && preg_match($propertyPattern, $elementProperty, $matches)) {
            // Walks the whole match array backwards, so besides the full match
            // (e.g. "og:title") each capture group (e.g. "title" and "og") ends
            // up as a key of its own.
            foreach (array_reverse($matches) as $match) {
                // Lowercase and strip whitespace so the keys can be looked up below.
                $key = preg_replace('/\s/', '', mb_strtolower($match));
                $values[$key] = trim($content);
            }
        }

        // Fall back to the name attribute only when the property attribute gave no match.
        if (!$matches && $elementName && preg_match($namePattern, $elementName) && $content) {
            // Lowercase, strip whitespace and turn dots into colons so the keys
            // can be looked up below.
            $key = preg_replace(['/\s/', '/\./'], ['', ':'], mb_strtolower($elementName));
            $values[$key] = trim($content);
        }
    }

    /*
     * Returns the value stored under the first candidate key that is present
     * in $values, or null when none of the candidates is present.
     */
    $pick = function (array $candidates) use ($values) {
        $key = current(array_intersect($candidates, array_keys($values)));

        return isset($values[$key]) ? $values[$key] : null;
    };

    // get title
    $title = $pick([
        'dc:title',
        'dcterm:title',
        'og:title',
        'weibo:article:title',
        'weibo:webpage:title',
        'title',
        'twitter:title'
    ]);

    $this->setTitle($title !== null ? trim($title) : null);

    if (!$this->getTitle()) {
        $this->setTitle($this->getArticleTitle());
    }

    // get author
    $this->setAuthor($pick([
        'dc:creator',
        'dcterm:creator',
        'author'
    ]));

    // get description
    $this->setExcerpt($pick([
        'dc:description',
        'dcterm:description',
        'og:description',
        'weibo:article:description',
        'weibo:webpage:description',
        'description',
        'twitter:description'
    ]));

    // get main image
    $this->setImage($pick([
        'image',
        'og:image',
        'twitter:image'
    ]));

    // get site name
    $this->setSiteName($pick([
        'og:site_name'
    ]));
}
||
395 | |||
/**
 * Returns all the images of the parsed article.
 *
 * The main image (when set) comes first, followed by the src attribute of every
 * <img> tag found in the parsed document. When the FixRelativeURLs option is on,
 * every entry is converted to an absolute URI. Empty entries and duplicates are
 * removed; the keys of the remaining entries are preserved, so the returned
 * array is not guaranteed to be sequentially indexed.
 *
 * @return array
 */
public function getImages()
{
    $result = [];
    if ($this->getImage()) {
        $result[] = $this->getImage();
    }

    $document = $this->getDOMDocument();
    if (null == $document) {
        return $result;
    }

    foreach ($document->getElementsByTagName('img') as $img) {
        if ($src = $img->getAttribute('src')) {
            $result[] = $src;
        }
    }

    if ($this->configuration->getFixRelativeURLs()) {
        // array_map instead of a by-reference foreach: no reference to the last
        // element is left dangling after the loop.
        $result = array_map([$this, 'toAbsoluteURI'], $result);
    }

    return array_unique(array_filter($result));
}
||
428 | |||
/**
 * Tries to get the main article image. Will only update the metadata if the getMetadata function couldn't
 * find a correct image.
 *
 * When no image is set yet, the first <link> tag whose rel attribute is
 * "img_src" or "image_src" and that carries an href attribute is used. The image
 * is converted to an absolute URI only when the FixRelativeURLs option is on;
 * otherwise it is stored as found.
 */
public function getMainImage()
{
    $imgUrl = $this->getImage();

    if (!$imgUrl) {
        foreach ($this->dom->getElementsByTagName('link') as $link) {
            /** @var \DOMElement $link */
            // The href attribute is the one holding the image url.
            if (in_array($link->getAttribute('rel'), ['img_src', 'image_src'], true) && $link->hasAttribute('href')) {
                $imgUrl = $link->getAttribute('href');
                break;
            }
        }
    }

    if (empty($imgUrl)) {
        return;
    }

    // Previously the image was stored only when FixRelativeURLs was enabled, which
    // silently dropped images discovered through <link> tags when it was disabled.
    if ($this->configuration->getFixRelativeURLs()) {
        $imgUrl = $this->toAbsoluteURI($imgUrl);
    }

    $this->setImage($imgUrl);
}
||
459 | |||
/**
 * Returns the title of the html. Prioritizes the title from the metadata against the title tag.
 *
 * The raw title is then cleaned up:
 *  - when it contains a separator surrounded by spaces (|, -, \, /, > or »), the
 *    part after the last separator is dropped (or the part before the first one
 *    when the result has fewer than 3 words);
 *  - otherwise, when it contains ": " and no h1/h2 holds the exact title, the
 *    part after the last colon is used (with fallbacks, see below);
 *  - otherwise, when it is longer than 150 or shorter than 15 characters and the
 *    document has exactly one h1, the h1 text is used.
 * If the cleaned title ends up too short, the original title is returned.
 *
 * All separator patterns use the "u" modifier because "»" is a multi-byte
 * character in UTF-8: without it the character class works on single bytes and
 * can neither match " » " nor avoid cutting a character in half. Titles are
 * expected to be valid UTF-8, as they come out of the DOM.
 *
 * @return string|null Null when neither the metadata nor a title tag provide a title
 */
private function getArticleTitle()
{
    $originalTitle = null;

    if ($this->getTitle()) {
        $originalTitle = $this->getTitle();
    } else {
        $this->logger->debug('[Metadata] Could not find title in metadata, searching for the title tag...');
        $titleTag = $this->dom->getElementsByTagName('title');
        if ($titleTag->length > 0) {
            $this->logger->info(sprintf('[Metadata] Using title tag as article title: \'%s\'', $titleTag->item(0)->nodeValue));
            $originalTitle = $titleTag->item(0)->nodeValue;
        }
    }

    if ($originalTitle === null) {
        return null;
    }

    $curTitle = $originalTitle = trim($originalTitle);
    $titleHadHierarchicalSeparators = false;

    // Character class contents: every separator, and the 'hierarchical' subset (\, /, > and »).
    $anySeparator = '\|\-\\\\\/>»';
    $hierarchicalSeparator = '\\\\\/>»';

    // Number of whitespace separated words in a text.
    $countWords = function ($text) {
        return count(preg_split('/\s+/', $text));
    };

    // If there's a separator in the title, first remove the final part
    if (preg_match('/ ['.$anySeparator.'] /u', $curTitle)) {
        $titleHadHierarchicalSeparators = (bool) preg_match('/ ['.$hierarchicalSeparator.'] /u', $curTitle);
        $curTitle = preg_replace('/(.*)['.$anySeparator.'] .*/u', '$1', $originalTitle);

        $this->logger->info(sprintf('[Metadata] Found hierarchical separators in title, new title is: \'%s\'', $curTitle));

        // If the resulting title is too short (fewer than 3 words), remove
        // the first part instead:
        if ($countWords($curTitle) < 3) {
            $curTitle = preg_replace('/[^'.$anySeparator.']*['.$anySeparator.'](.*)/u', '$1', $originalTitle);
            $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
        }
    } elseif (strpos($curTitle, ': ') !== false) {
        // Check if we have an heading containing this exact string, so we
        // could assume it's the full title.
        $match = false;
        for ($i = 1; $i <= 2; $i++) {
            foreach ($this->dom->getElementsByTagName('h'.$i) as $hTag) {
                // Trim texts to avoid having false negatives when the title is surrounded by spaces or tabs
                if (trim($hTag->nodeValue) === trim($curTitle)) {
                    $match = true;
                }
            }
        }

        // If we don't, let's extract the title out of the original title string.
        if (!$match) {
            $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);

            $this->logger->info(sprintf('[Metadata] Title has a colon in the middle, new title is: \'%s\'', $curTitle));

            // If the title is now too short, try the first colon instead:
            if ($countWords($curTitle) < 3) {
                $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
                $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
            } elseif ($countWords(substr($originalTitle, 0, strpos($originalTitle, ':'))) > 5) {
                // But if we have too many words before the colon there's something weird
                // with the titles and the H tags so let's just use the original title instead.
                // The words are counted on the original title: the current title is the part
                // after the last colon and therefore never contains one.
                $curTitle = $originalTitle;
            }
        }
    } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
        $hOnes = $this->dom->getElementsByTagName('h1');

        if ($hOnes->length === 1) {
            $curTitle = $hOnes->item(0)->nodeValue;
            $this->logger->info(sprintf('[Metadata] Using title from an H1 node: \'%s\'', $curTitle));
        }
    }

    $curTitle = trim($curTitle);

    /*
     * If we now have 4 words or fewer as our title, and either no
     * 'hierarchical' separators (\, /, > or ») were found in the original
     * title or we decreased the number of words by more than 1 word, use
     * the original title.
     */
    $curTitleWordCount = $countWords($curTitle);
    $originalTitleWordCount = $countWords((string) preg_replace('/['.$anySeparator.']+/u', '', $originalTitle)) - 1;

    if ($curTitleWordCount <= 4 &&
        (!$titleHadHierarchicalSeparators || $curTitleWordCount !== $originalTitleWordCount)) {
        $curTitle = $originalTitle;

        $this->logger->info(sprintf('[Metadata] Title too short after processing, using the original title: \'%s\'', $curTitle));
    }

    return $curTitle;
}
||
563 | |||
/**
 * Convert URI to an absolute URI.
 *
 * The base used for the conversion is derived from the original URL set in the
 * configuration (see getPathInfo()). Absolute URIs and hash-only URIs are
 * returned untouched.
 *
 * @param $uri string URI to convert
 *
 * @return string
 */
private function toAbsoluteURI($uri)
{
    list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->configuration->getOriginalURL());

    // Already absolute (starts with a scheme): nothing to do.
    if (preg_match('/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/', $uri)) {
        return $uri;
    }

    // Scheme-rooted relative URI ("//host/path"): only the scheme is missing.
    if (strpos($uri, '//') === 0) {
        return $scheme.'://'.substr($uri, 2);
    }

    // Prepath-rooted relative URI ("/path"): prepend scheme and host.
    if (strpos($uri, '/') === 0) {
        return $prePath.$uri;
    }

    // Dotslash relative URI ("./path"): drop the "./" and resolve against the base path.
    if (substr($uri, 0, 2) === './') {
        return $pathBase.substr($uri, 2);
    }

    // Hash URIs ("#anchor") are left alone.
    if (strpos($uri, '#') === 0) {
        return $uri;
    }

    // Standard relative URI; add entire path. pathBase already includes a
    // trailing "/".
    return $pathBase.$uri;
}
||
603 | |||
/**
 * Returns full path info of an URL.
 *
 * The path base is "scheme://host" followed by the directory of the URL path and
 * a trailing slash. When the loaded document reports a base URI, it is taken
 * into account: a base starting with "/" replaces the directory completely, any
 * other base is appended to the directory.
 *
 * The directory is stripped of its trailing slash before the separator is added,
 * so URLs whose path sits at the root (e.g. "http://host/article.html") yield
 * "http://host/" and not "http://host//".
 *
 * @param string $url
 *
 * @return array [$pathBase, $scheme, $prePath]
 */
public function getPathInfo($url)
{
    $origin = parse_url($url, PHP_URL_SCHEME).'://'.parse_url($url, PHP_URL_HOST);

    // dirname() returns "/" (or "\" on Windows) for root level paths; trim it so the
    // slash appended below does not get doubled.
    $directory = rtrim(dirname((string) parse_url($url, PHP_URL_PATH)), '/\\');

    $baseURI = $this->dom->baseURI;

    // Check for base URLs
    if ($baseURI === null) {
        $pathBase = $origin.$directory.'/';
    } elseif (substr($baseURI, 0, 1) === '/') {
        // URLs starting with '/' override completely the URL defined in the link
        $pathBase = $origin.$baseURI;
    } else {
        // Otherwise just prepend the base to the actual path
        $pathBase = $origin.$directory.'/'.rtrim($baseURI, '/').'/';
    }

    $scheme = parse_url($pathBase, PHP_URL_SCHEME);
    $prePath = $scheme.'://'.parse_url($pathBase, PHP_URL_HOST);

    return [$pathBase, $scheme, $prePath];
}
||
631 | |||
/**
 * Gets nodes from the root element.
 *
 * Walks the tree starting at $node (advancing with NodeUtility::getNextNode())
 * and, along the way, cleans it up: comment nodes, nodes reported as not visible,
 * bylines, unlikely candidates (only when the StripUnlikelyCandidates option is
 * on) and empty container/heading/paragraph nodes are removed. DIVs get their
 * phrasing content wrapped into P nodes and may themselves be replaced by, or
 * converted into, a P node.
 *
 * The document is modified in place; the return value is only the list of nodes
 * that should be scored afterwards.
 *
 * @param $node DOMNode|DOMText Node to start the traversal from
 *
 * @return array Nodes to score: nodes whose tag is listed in $defaultTagsToScore plus the P nodes that replaced DIVs
 */
private function getNodes($node)
{
    $this->logger->info('[Get Nodes] Retrieving nodes...');

    // Read once: the option does not change while the tree is being walked.
    $stripUnlikelyCandidates = $this->configuration->getStripUnlikelyCandidates();

    $elementsToScore = [];

    /*
     * First, node prepping. Trash nodes that look cruddy (like ones with the
     * class name "comment", etc), and turn divs into P tags where they have been
     * used inappropriately (as in, where they contain no other block level elements.)
     */

    while ($node) {
        // Remove DOMComments nodes as we don't need them and mess up children counting
        if ($node->nodeType === XML_COMMENT_NODE) {
            $this->logger->debug(sprintf('[Get Nodes] Found comment node, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        // Class and id joined by a space; this is the string the byline and
        // candidate regular expressions below are matched against.
        $matchString = $node->getAttribute('class').' '.$node->getAttribute('id');

        // Drop nodes that the node itself reports as (probably) not visible.
        if (!$node->isProbablyVisible()) {
            $this->logger->debug(sprintf('[Get Nodes] Removing hidden node... Match string was: \'%s\'', $matchString));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        // Check to see if this node is a byline, and remove it if it is.
        // checkByline() also stores the author as a side effect when it returns true.
        if ($this->checkByline($node, $matchString)) {
            $this->logger->debug(sprintf('[Get Nodes] Found byline, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        // Remove unlikely candidates: class/id matches the 'unlikelyCandidates'
        // pattern and not the 'okMaybeItsACandidate' one. BODY and A nodes are never removed here.
        if ($stripUnlikelyCandidates) {
            if (
                preg_match(NodeUtility::$regexps['unlikelyCandidates'], $matchString) &&
                !preg_match(NodeUtility::$regexps['okMaybeItsACandidate'], $matchString) &&
                $node->nodeName !== 'body' &&
                $node->nodeName !== 'a'
            ) {
                $this->logger->debug(sprintf('[Get Nodes] Removing unlikely candidate. Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
                $node = NodeUtility::removeAndGetNext($node);
                continue;
            }
        }

        // Remove DIV, SECTION, HEADER, H1-H6 and P nodes without any content (e.g. text, image, video, or iframe).
        if (($node->nodeName === 'div' || $node->nodeName === 'section' || $node->nodeName === 'header' ||
                $node->nodeName === 'h1' || $node->nodeName === 'h2' || $node->nodeName === 'h3' ||
                $node->nodeName === 'h4' || $node->nodeName === 'h5' || $node->nodeName === 'h6' ||
                $node->nodeName === 'p') &&
            $node->isElementWithoutContent()) {
            $this->logger->debug(sprintf('[Get Nodes] Removing empty \'%s\' node.', $node->nodeName));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        // Nodes with one of the default tags are scored as they are.
        if (in_array(strtolower($node->nodeName), $this->defaultTagsToScore)) {
            $this->logger->debug(sprintf('[Get Nodes] Adding node to score list, node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
            $elementsToScore[] = $node;
        }

        // Turn all divs that don't have children block level elements into p's
        if ($node->nodeName === 'div') {
            // Put phrasing content into paragraphs.
            // $p is the paragraph currently being filled, or null when none is open.
            $p = null;
            $childNode = $node->firstChild;
            while ($childNode) {
                // Remember the sibling first: the child may be moved into a paragraph below.
                $nextSibling = $childNode->nextSibling;
                if ($childNode->isPhrasingContent()) {
                    if ($p !== null) {
                        $p->appendChild($childNode);
                    } elseif (!$childNode->isWhitespace()) {
                        // First non-whitespace phrasing child of a run: open a new
                        // paragraph in its place and move the child into it.
                        $p = $this->dom->createElement('p');
                        $node->replaceChild($p, $childNode);
                        $p->appendChild($childNode);
                    }
                } elseif ($p !== null) {
                    // A non-phrasing child ends the run: strip trailing whitespace
                    // from the paragraph and close it.
                    while ($p->lastChild && $p->lastChild->isWhitespace()) {
                        $p->removeChild($p->lastChild);
                    }
                    $p = null;
                }
                $childNode = $nextSibling;
            }

            /*
             * Sites like http://mobile.slate.com encloses each paragraph with a DIV
             * element. DIVs with only a P element inside and no text content can be
             * safely converted into plain P elements to avoid confusing the scoring
             * algorithm with DIVs with are, in practice, paragraphs.
             */
            if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) {
                $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
                // The DIV is replaced by its only P child, and the traversal continues from that P.
                $pNode = NodeUtility::filterTextNodes($node->childNodes)->item(0);
                $node->parentNode->replaceChild($pNode, $node);
                $node = $pNode;
                $elementsToScore[] = $node;
            } elseif (!$node->hasSingleChildBlockElement()) {
                // NOTE(review): the log text below reads as the opposite of the condition
                // (this branch runs when hasSingleChildBlockElement() is false) - confirm
                // which of the two states the intent.
                $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single child block element, converting to a P node. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
                $node = NodeUtility::setNodeTag($node, 'p');
                $elementsToScore[] = $node;
            }
        }

        $node = NodeUtility::getNextNode($node);
    }

    return $elementsToScore;
}
||
754 | |||
/**
 * Checks if the node is a byline.
 *
 * A node is a byline when its rel attribute is "author" or its class/id string
 * matches the byline pattern and, in both cases, its text passes isValidByline().
 * When a byline is found, its trimmed text is stored as the article author.
 *
 * Nothing is checked (and false is returned) when the ArticleByLine option is off
 * or when an author has already been set.
 *
 * @param DOMNode $node
 * @param string $matchString Class and id of the node, joined by a space
 *
 * @return bool True when the node is a byline and the author has been set from it
 */
private function checkByline($node, $matchString)
{
    if (!$this->configuration->getArticleByLine()) {
        return false;
    }

    /*
     * Check if the byline is already set
     */
    if ($this->getAuthor()) {
        return false;
    }

    $rel = $node->getAttribute('rel');

    // The parentheses matter: && binds tighter than ||, so without them a
    // rel="author" node would skip the validity check on its text.
    $looksLikeByline = $rel === 'author' || preg_match(NodeUtility::$regexps['byline'], $matchString);

    if ($looksLikeByline && $this->isValidByline($node->getTextContent())) {
        $this->logger->info(sprintf('[Metadata] Found article author: \'%s\'', $node->getTextContent()));
        $this->setAuthor(trim($node->getTextContent()));

        return true;
    }

    return false;
}
||
787 | |||
/**
 * Checks the validity of a byLine. Based on string length.
 *
 * A byline is valid when it is a string whose trimmed length is between 1 and 99
 * characters (both included). Anything that is not a string is invalid.
 *
 * @param string $text
 *
 * @return bool
 */
private function isValidByline($text)
{
    if (!is_string($text)) {
        return false;
    }

    $length = mb_strlen(trim($text));

    return $length > 0 && $length < 100;
}
||
805 | |||
/**
 * Removes all the scripts of the html.
 *
 * Both <script> and <noscript> nodes are removed, in that order.
 *
 * @param DOMDocument $dom
 */
private function removeScripts(DOMDocument $dom)
{
    $tagNames = ['script', 'noscript'];

    foreach ($tagNames as $tagName) {
        // Work on a copy of the node list: the list is turned into a plain array
        // first so removing nodes does not interfere with the iteration.
        $snapshot = iterator_to_array($dom->getElementsByTagName($tagName));

        foreach ($snapshot as $scriptNode) {
            NodeUtility::removeNode($scriptNode);
        }
    }
}
||
820 | |||
/**
 * Normalises the document before it is scored.
 *
 * Two transformations are applied in place:
 *  - chains of two or more <br> elements are collapsed into a <p> that adopts
 *    the phrasing content following the chain;
 *  - <font> elements are renamed to <span>.
 *
 * @param DOMDocument $dom Document to prepare, modified in place
 */
private function prepDocument(DOMDocument $dom)
{
    $this->logger->info('[PrepDocument] Preparing document for parsing...');

    foreach ($dom->shiftingAwareGetElementsByTagName('br') as $br) {
        $cursor = $br->nextSibling;

        // Becomes true once at least one extra <br> of the chain has been dropped.
        $chainCollapsed = false;

        /*
         * Drop every <br> that directly follows this one (as reported by
         * NodeUtility::nextElement). The first <br> of the chain is kept for
         * now and swapped for a <p> below.
         */
        while (($cursor = NodeUtility::nextElement($cursor)) && ($cursor->nodeName === 'br')) {
            $this->logger->debug('[PrepDocument] Removing chain of BR nodes...');

            $chainCollapsed = true;
            $afterRemoved = $cursor->nextSibling;
            $cursor->parentNode->removeChild($cursor);
            $cursor = $afterRemoved;
        }

        // A lone <br> is left untouched.
        if (!$chainCollapsed) {
            continue;
        }

        // Swap the surviving <br> for a <p> ...
        $paragraph = $dom->createElement('p');
        $br->parentNode->replaceChild($paragraph, $br);

        // ... and move the following siblings into it until the next <br>
        // chain or the first piece of non-phrasing content.
        $cursor = $paragraph->nextSibling;
        while ($cursor) {
            if ($cursor->nodeName === 'br') {
                $following = NodeUtility::nextElement($cursor->nextSibling);
                if ($following && $following->nodeName === 'br') {
                    // Another <br><br>: this paragraph is complete.
                    break;
                }
            }

            if (!$cursor->isPhrasingContent()) {
                break;
            }

            $this->logger->debug('[PrepDocument] Replacing BR with a P node...');

            // Remember the next sibling before the node is moved.
            $upcoming = $cursor->nextSibling;
            $paragraph->appendChild($cursor);
            $cursor = $upcoming;
        }

        // Trim trailing whitespace nodes from the new paragraph.
        while ($paragraph->lastChild && $paragraph->lastChild->isWhitespace()) {
            $paragraph->removeChild($paragraph->lastChild);
        }

        // A <p> may not sit inside another <p>: turn the outer one into a <div>.
        if ($paragraph->parentNode->tagName === 'p') {
            NodeUtility::setNodeTag($paragraph->parentNode, 'div');
        }
    }

    // Replace font tags with span, walking the live list from its end.
    $fonts = $dom->getElementsByTagName('font');
    for ($index = $fonts->length - 1; $index >= 0; $index--) {
        $this->logger->debug('[PrepDocument] Converting font tag into a span tag.');
        NodeUtility::setNodeTag($fonts->item($index), 'span');
    }
}
||
904 | |||
/**
 * Scores the given nodes, selects the best candidate and assembles the article around it.
 *
 * Steps, in order:
 *  1. Every node with a parent, at least one ancestor and 25+ characters of text
 *     gets a content score; that score is added to each ancestor, divided by
 *     1 (parent), 2 (grandparent) or level * 3 (further up).
 *  2. Candidate scores are scaled by (1 - link density) and the best
 *     getMaxTopCandidates() candidates are kept, ordered by score.
 *  3. If there is no candidate, or the candidate is <body>, the children of
 *     <body> are copied into a fresh <div> that becomes the candidate.
 *     Otherwise the candidate may be promoted to one of its ancestors.
 *  4. The candidate and its qualifying siblings are imported into a new
 *     document, which is then passed through prepArticle().
 *
 * @param array $nodes Nodes to score
 *
 * @return DOMDocument|bool The article document, or false when nothing could be extracted
 */
private function rateNodes($nodes)
{
    $this->logger->info('[Rating] Rating nodes...');

    $candidates = [];

    /** @var DOMElement $node */
    foreach ($nodes as $node) {
        if (is_null($node->parentNode)) {
            continue;
        }

        $innerText = $node->getTextContent(true);

        // Discard nodes with less than 25 characters, without blank space
        if (mb_strlen($innerText) < 25) {
            continue;
        }

        $ancestors = $node->getNodeAncestors();

        // Exclude nodes with no ancestor
        if (count($ancestors) === 0) {
            continue;
        }

        // Start with a point for the paragraph itself as a base.
        $contentScore = 1;

        // Add one point per comma-separated segment (number of commas + 1).
        $contentScore += count(explode(',', $innerText));

        // For every 100 characters in this paragraph, add another point. Up to 3 points.
        $contentScore += min(floor(mb_strlen($innerText) / 100), 3);

        // mb_substr keeps multibyte characters intact in the log excerpt.
        $this->logger->debug(sprintf('[Rating] Node score %s, content: \'%s\'', $contentScore, mb_substr($node->nodeValue, 0, 128)));

        /** @var $ancestor DOMElement */
        foreach ($ancestors as $level => $ancestor) {
            $this->logger->debug('[Rating] Found ancestor, initializing and adding it as a candidate...');
            if (!$ancestor->isInitialized()) {
                $ancestor->initializeNode($this->configuration->getWeightClasses());
                $candidates[] = $ancestor;
            }

            /*
             * Node score divider:
             *  - parent:             1 (no division)
             *  - grandparent:        2
             *  - great grandparent+: ancestor level * 3
             */
            if ($level === 0) {
                $scoreDivider = 1;
            } elseif ($level === 1) {
                $scoreDivider = 2;
            } else {
                $scoreDivider = $level * 3;
            }

            $ancestor->contentScore = $ancestor->contentScore + ($contentScore / $scoreDivider);

            $this->logger->debug(sprintf('[Rating] Ancestor score %s, value: \'%s\'', $ancestor->contentScore, mb_substr($ancestor->nodeValue, 0, 128)));
        }
    }

    /*
     * After we've calculated scores, loop through all of the possible
     * candidate nodes we found and keep the best ones, highest score first.
     */
    $maxTopCandidates = $this->configuration->getMaxTopCandidates();
    $topCandidates = [];
    foreach ($candidates as $candidate) {
        /*
         * Scale the final candidates score based on link density. Good content
         * should have a relatively small link density (5% or less) and be mostly
         * unaffected by this operation.
         */
        $candidate->contentScore = $candidate->contentScore * (1 - $candidate->getLinkDensity());

        for ($i = 0; $i < $maxTopCandidates; $i++) {
            $aTopCandidate = isset($topCandidates[$i]) ? $topCandidates[$i] : null;

            if (!$aTopCandidate || $candidate->contentScore > $aTopCandidate->contentScore) {
                // Insert at position $i and drop the lowest entry if the list overflows.
                array_splice($topCandidates, $i, 0, [$candidate]);
                if (count($topCandidates) > $maxTopCandidates) {
                    array_pop($topCandidates);
                }
                break;
            }
        }
    }

    $topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null;

    /*
     * If we still have no top candidate, just use the body as a last resort.
     * We also have to copy the body node so it is something we can modify.
     */
    if ($topCandidate === null || $topCandidate->nodeName === 'body') {
        $this->logger->info('[Rating] No top candidate found or top candidate is the body tag. Moving all child nodes to a new DIV node.');

        $body = $this->dom->getElementsByTagName('body')->item(0);
        if ($body === null) {
            // No <body> to fall back on: report failure instead of calling a method on null.
            return false;
        }

        // Copy all of the page's children into a fresh <div>
        $wrapperDocument = new DOMDocument('1.0', 'utf-8');
        $wrapperDocument->encoding = 'UTF-8';
        $wrapperDocument->appendChild($wrapperDocument->createElement('div', ''));
        $kids = $body->childNodes;

        // importNode() copies, so the source list does not shift and an index loop is safe.
        for ($i = 0; $i < $kids->length; $i++) {
            $import = $wrapperDocument->importNode($kids->item($i), true);
            $wrapperDocument->firstChild->appendChild($import);
        }

        // Candidate must be created using firstChild to grab the DOMElement instead of the DOMDocument.
        $topCandidate = $wrapperDocument->firstChild;
    } else {
        $this->logger->info(sprintf('[Rating] Found top candidate, score: %s', $topCandidate->contentScore));

        // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
        // and whose scores are quite close to the current `topCandidate` node.
        $alternativeCandidateAncestors = [];
        $topCandidatesCount = count($topCandidates);
        for ($i = 1; $i < $topCandidatesCount; $i++) {
            // In some cases we may end up with a top candidate with zero content score. To avoid dividing by zero
            // we have to use max() and replace zero with a low value like 0.1
            if ($topCandidates[$i]->contentScore / max($topCandidate->contentScore, 0.1) >= 0.75) {
                $alternativeCandidateAncestors[] = $topCandidates[$i]->getNodeAncestors(false);
            }
        }

        $minimumTopCandidates = 3;
        $alternativesCount = count($alternativeCandidateAncestors);
        if ($alternativesCount >= $minimumTopCandidates) {
            $parentOfTopCandidate = $topCandidate->parentNode;

            // Only walk real elements: stop at <body>, at the document node or at a detached root.
            while ($parentOfTopCandidate !== null && $parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) {
                $listsContainingThisAncestor = 0;
                for ($ancestorIndex = 0; $ancestorIndex < $alternativesCount && $listsContainingThisAncestor < $minimumTopCandidates; $ancestorIndex++) {
                    // Strict mode compares node identity instead of loosely comparing object properties.
                    $listsContainingThisAncestor += (int) in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true);
                }
                if ($listsContainingThisAncestor >= $minimumTopCandidates) {
                    $topCandidate = $parentOfTopCandidate;
                    break;
                }
                $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
            }
        }

        /*
         * Because of our bonus system, parents of candidates might have scores
         * themselves. They get half of the node. There won't be nodes with higher
         * scores than our topCandidate, but if we see the score going *up* in the first
         * few steps up the tree, that's a decent sign that there might be more content
         * lurking in other places that we want to unify in. The sibling stuff
         * below does some of that - but only if we've looked high enough up the DOM
         * tree.
         */
        $parentOfTopCandidate = $topCandidate->parentNode;
        $lastScore = $topCandidate->contentScore;

        // The scores shouldn't get too low.
        $scoreThreshold = $lastScore / 3;

        /*
         * Same guard as the loop above: without it a candidate that does not
         * live under <body> makes the walk run past the document root.
         */
        /* @var DOMElement $parentOfTopCandidate */
        while ($parentOfTopCandidate !== null && $parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) {
            $parentScore = $parentOfTopCandidate->contentScore;
            if ($parentScore < $scoreThreshold) {
                break;
            }

            if ($parentScore > $lastScore) {
                // Alright! We found a better parent to use.
                $topCandidate = $parentOfTopCandidate;
                $this->logger->info('[Rating] Found a better top candidate.');
                break;
            }
            $lastScore = $parentOfTopCandidate->contentScore;
            $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
        }

        // If the top candidate is the only child, use parent instead. This will help sibling
        // joining logic when adjacent content is actually located in parent's sibling node.
        $parentOfTopCandidate = $topCandidate->parentNode;
        while (
            $parentOfTopCandidate !== null
            && $parentOfTopCandidate->nodeName !== 'body'
            && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE
            && count(NodeUtility::filterTextNodes($parentOfTopCandidate->childNodes)) === 1
        ) {
            $topCandidate = $parentOfTopCandidate;
            $parentOfTopCandidate = $topCandidate->parentNode;
        }
    }

    /*
     * Now that we have the top candidate, look through its siblings for content
     * that might also be related. Things like preambles, content split by ads
     * that we removed, etc.
     */
    $this->logger->info('[Rating] Creating final article content document...');

    // Selected nodes are appended directly to this document.
    $articleContent = new DOMDocument('1.0', 'utf-8');

    $siblingScoreThreshold = max(10, $topCandidate->contentScore * 0.2);
    // Keep potential top candidate's parent node to try to get text direction of it later.
    $parentOfTopCandidate = $topCandidate->parentNode;
    $siblings = $parentOfTopCandidate->childNodes;

    $hasContent = false;

    $this->logger->info('[Rating] Adding top candidate siblings...');

    /* @var DOMElement $sibling */
    // Can't foreach here because down there we might change the tag name and that causes the foreach to skip items
    for ($i = 0; $i < $siblings->length; $i++) {
        $sibling = $siblings->item($i);
        $append = false;

        if ($sibling === $topCandidate) {
            $this->logger->debug('[Rating] Sibling is equal to the top candidate, adding to the final article...');

            $append = true;
        } else {
            $contentBonus = 0;

            // Give a bonus if the sibling and the top candidate have the exact same, non-empty class name
            if ($sibling->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') {
                $contentBonus += $topCandidate->contentScore * 0.2;
            }

            if ($sibling->contentScore + $contentBonus >= $siblingScoreThreshold) {
                $append = true;
            } elseif ($sibling->nodeName === 'p') {
                $linkDensity = $sibling->getLinkDensity();
                $nodeContent = $sibling->getTextContent(true);

                if (mb_strlen($nodeContent) > 80 && $linkDensity < 0.25) {
                    // Long paragraph with few links.
                    $append = true;
                } elseif ($nodeContent && mb_strlen($nodeContent) < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) {
                    // Short, link-free paragraph that contains a full stop.
                    $append = true;
                }
            }
        }

        if ($append) {
            $this->logger->debug(sprintf('[Rating] Appending sibling to final article, content is: \'%s\'', mb_substr($sibling->nodeValue, 0, 128)));

            $hasContent = true;

            if (!in_array(strtolower($sibling->nodeName), $this->alterToDIVExceptions)) {
                /*
                 * We have a node that isn't a common block level element, like a form or td tag.
                 * Turn it into a div so it doesn't get filtered out later by accident.
                 */
                $sibling = NodeUtility::setNodeTag($sibling, 'div');
            }

            // importNode() copies the node, so the sibling list being iterated keeps its positions.
            $import = $articleContent->importNode($sibling, true);
            $articleContent->appendChild($import);
        }
    }

    $articleContent = $this->prepArticle($articleContent);

    if (!$hasContent) {
        return false;
    }

    // Find out text direction from ancestors of final top candidate.
    $ancestors = array_merge([$parentOfTopCandidate, $topCandidate], $parentOfTopCandidate->getNodeAncestors());
    foreach ($ancestors as $ancestor) {
        $articleDir = $ancestor->getAttribute('dir');
        if ($articleDir) {
            $this->setDirection($articleDir);
            $this->logger->debug(sprintf('[Rating] Found article direction: %s', $articleDir));
            break;
        }
    }

    return $articleContent;
}
||
1199 | |||
/**
 * Cleans up the final article.
 *
 * Runs the cleaning passes in a fixed order: presentational attributes and
 * <style>, data-table marking, conditional and unconditional removal of junk
 * elements, "share" widgets, a duplicated title in a lone <h2>, form controls,
 * spurious headers, empty paragraphs, <br> elements followed by a <p>, and
 * finally single-cell tables, which are unwrapped.
 *
 * @param DOMDocument $article Article document, modified in place
 *
 * @return DOMDocument The same document instance
 */
public function prepArticle(DOMDocument $article)
{
    $this->logger->info('[PrepArticle] Preparing final article...');

    $this->_cleanStyles($article);
    $this->_clean($article, 'style');

    // Check for data tables before we continue, to avoid removing items in
    // those tables, which will often be isolated even though they're
    // visually linked to other content-ful elements (text, images, etc.).
    $this->_markDataTables($article);

    // Clean out junk from the article content
    $this->_cleanConditionally($article, 'form');
    $this->_cleanConditionally($article, 'fieldset');
    foreach (['object', 'embed', 'h1', 'footer', 'link', 'aside'] as $tag) {
        $this->_clean($article, $tag);
    }

    // Clean out elements that have "share" in their id/class combinations from the final top
    // candidates. The top candidates themselves are kept even if they match "share".
    foreach ($article->childNodes as $child) {
        $this->_cleanMatchedNodes($child, '/share/i');
    }

    /*
     * If there is only one h2 and its text content substantially equals article title,
     * they are probably using it as a header and not a subheader,
     * so remove it since we already extract the title separately.
     *
     * The title may still be null at this point. It is cast to a string and the
     * comparison is skipped when it is empty: there is nothing to compare against,
     * and an empty needle must never reach strpos().
     */
    $title = (string) $this->getTitle();
    $h2 = $article->getElementsByTagName('h2');
    if ($title !== '' && $h2->length === 1) {
        $h2Text = $h2->item(0)->textContent;
        $titleLength = mb_strlen($title);
        $lengthSimilarRate = (mb_strlen($h2Text) - $titleLength) / $titleLength;

        if (abs($lengthSimilarRate) < 0.5) {
            // Look for the shorter text inside the longer one.
            if ($lengthSimilarRate > 0) {
                $titlesMatch = strpos($h2Text, $title) !== false;
            } else {
                $titlesMatch = strpos($title, $h2Text) !== false;
            }

            if ($titlesMatch) {
                $this->logger->info('[PrepArticle] Found title repeated in an H2 node, removing...');
                $this->_clean($article, 'h2');
            }
        }
    }

    foreach (['iframe', 'input', 'textarea', 'select', 'button'] as $tag) {
        $this->_clean($article, $tag);
    }
    $this->_cleanHeaders($article);

    // Do these last as the previous stuff may have removed junk
    // that will affect these
    $this->_cleanConditionally($article, 'table');
    $this->_cleanConditionally($article, 'ul');
    $this->_cleanConditionally($article, 'div');

    $this->_cleanExtraParagraphs($article);

    // Drop <br> elements that are immediately followed by a <p>. The list is
    // snapshotted because nodes are removed while iterating.
    foreach (iterator_to_array($article->getElementsByTagName('br')) as $br) {
        $next = $br->nextSibling;
        if ($next && $next->nodeName === 'p') {
            $this->logger->debug('[PrepArticle] Removing br node next to a p node.');
            $br->parentNode->removeChild($br);
        }
    }

    // Remove single-cell tables
    foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) {
        /** @var DOMNode $table */
        $tbody = $table->hasSingleTagInsideElement('tbody') ? $table->getFirstElementChild() : $table;
        if (!$tbody->hasSingleTagInsideElement('tr')) {
            continue;
        }

        $row = $tbody->getFirstElementChild();
        if (!$row->hasSingleTagInsideElement('td')) {
            continue;
        }

        $cell = $row->getFirstElementChild();

        // The cell becomes a <p> when all of its children are phrasing content, a <div> otherwise.
        $onlyPhrasingContent = array_reduce(iterator_to_array($cell->childNodes), function ($carry, $node) {
            return $node->isPhrasingContent() && $carry;
        }, true);

        $cell = NodeUtility::setNodeTag($cell, $onlyPhrasingContent ? 'p' : 'div');
        $table->parentNode->replaceChild($cell, $table);
    }

    return $article;
}
||
1298 | |||
/**
 * Look for 'data' (as opposed to 'layout') tables, for which we use
 * similar checks as
 * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920.
 *
 * Every <table> in the article is flagged through setReadabilityDataTable().
 * The first matching rule wins:
 *  - role="presentation" or datatable="0"                  => layout table
 *  - a non-empty summary attribute or a non-empty <caption> => data table
 *  - a col, colgroup, tfoot, thead or th descendant         => data table
 *  - a nested <table>                                       => layout table
 *  - 10+ rows or more than 4 columns                        => data table
 *  - otherwise: data table only when rows * columns > 10
 *
 * @param DOMDocument $article
 *
 * @return void
 */
public function _markDataTables(DOMDocument $article)
{
    $tables = $article->getElementsByTagName('table');
    foreach ($tables as $table) {
        /** @var DOMElement $table */
        if ($table->getAttribute('role') === 'presentation') {
            $table->setReadabilityDataTable(false);
            continue;
        }

        // Strict comparison: with `==`, numeric strings such as '0.0' or '00'
        // would also be treated as equal to '0'.
        if ($table->getAttribute('datatable') === '0') {
            $table->setReadabilityDataTable(false);
            continue;
        }

        if ($table->getAttribute('summary')) {
            $table->setReadabilityDataTable(true);
            continue;
        }

        $caption = $table->getElementsByTagName('caption');
        if ($caption->length > 0 && $caption->item(0)->childNodes->length > 0) {
            $table->setReadabilityDataTable(true);
            continue;
        }

        // If the table has a descendant with any of these tags, consider a data table:
        foreach (['col', 'colgroup', 'tfoot', 'thead', 'th'] as $dataTableDescendants) {
            if ($table->getElementsByTagName($dataTableDescendants)->length > 0) {
                $table->setReadabilityDataTable(true);
                // Move on to the next table, not just the next tag name.
                continue 2;
            }
        }

        // Nested tables indicate a layout table:
        if ($table->getElementsByTagName('table')->length > 0) {
            $table->setReadabilityDataTable(false);
            continue;
        }

        $sizeInfo = $table->getRowAndColumnCount();
        if ($sizeInfo['rows'] >= 10 || $sizeInfo['columns'] > 4) {
            $table->setReadabilityDataTable(true);
            continue;
        }

        // Now just go by size entirely:
        $table->setReadabilityDataTable($sizeInfo['rows'] * $sizeInfo['columns'] > 10);
    }
}
||
1358 | |||
/**
 * Removes the style attribute and the deprecated presentational attributes
 * from the given node and from everything below it.
 *
 * <svg> subtrees are left untouched. The width and height attributes are
 * additionally removed from table, th, td, hr and pre elements.
 *
 * @param $node DOMDocument|DOMNode Root of the subtree to clean
 **/
public function _cleanStyles($node)
{
    $hasTagName = property_exists($node, 'tagName');

    // Do not descend into SVG.
    if ($hasTagName && $node->tagName === 'svg') {
        return;
    }

    // Only nodes that expose removeAttribute() can carry attributes to strip.
    if (method_exists($node, 'removeAttribute')) {
        // `style` plus the deprecated presentational attributes
        $attributesToStrip = ['align', 'background', 'bgcolor', 'border', 'cellpadding', 'cellspacing', 'frame', 'hspace', 'rules', 'style', 'valign', 'vspace'];
        foreach ($attributesToStrip as $attributeName) {
            $node->removeAttribute($attributeName);
        }

        $sizedElements = ['table', 'th', 'td', 'hr', 'pre'];
        if ($hasTagName && in_array($node->tagName, $sizedElements)) {
            $node->removeAttribute('width');
            $node->removeAttribute('height');
        }
    }

    // Recurse into every child node.
    for ($child = $node->firstChild; $child !== null; $child = $child->nextSibling) {
        $this->_cleanStyles($child);
    }
}
||
1391 | |||
/**
 * Removes the nodes found after $node, up to the end-of-search marker, whose
 * "class id" string matches the given regular expression.
 *
 * The given node itself is never tested, so it is never removed.
 *
 * @param $node DOMElement Node to clean
 * @param $regex string Pattern matched against "<class> <id>"
 *
 * @return void
 **/
public function _cleanMatchedNodes($node, $regex)
{
    // Traversal stops once this node is reached.
    $stopAt = NodeUtility::getNextNode($node, true);
    $current = NodeUtility::getNextNode($node);

    while ($current && $current !== $stopAt) {
        $class = $current->getAttribute('class');
        $id = $current->getAttribute('id');

        if (!preg_match($regex, sprintf('%s %s', $class, $id))) {
            $current = NodeUtility::getNextNode($current);
            continue;
        }

        $this->logger->debug(sprintf('Removing matched node with regex: \'%s\', node class was: \'%s\', id: \'%s\'', $regex, $class, $id));
        $current = NodeUtility::removeAndGetNext($current);
    }
}
||
1413 | |||
/**
 * Removes the paragraphs that hold neither media nor visible text.
 *
 * A <p> is removed when it contains no img, embed, object or iframe element
 * and its text is empty once the 'onlyWhitespace' pattern has been stripped
 * from it.
 *
 * @param DOMDocument $article Article document, modified in place
 *
 * @return void
 */
public function _cleanExtraParagraphs(DOMDocument $article)
{
    $paragraphs = $article->getElementsByTagName('p');

    // Walk the live list backwards so that removals do not shift the
    // positions that are still to be visited.
    for ($index = $paragraphs->length - 1; $index >= 0; $index--) {
        $paragraph = $paragraphs->item($index);

        $imgCount = $paragraph->getElementsByTagName('img')->length;
        $embedCount = $paragraph->getElementsByTagName('embed')->length;
        $objectCount = $paragraph->getElementsByTagName('object')->length;
        // At this point, nasty iframes have been removed, only remain embedded video ones.
        $iframeCount = $paragraph->getElementsByTagName('iframe')->length;
        $totalCount = $imgCount + $embedCount + $objectCount + $iframeCount;

        if ($totalCount !== 0) {
            continue;
        }

        /*
         * Compare against the empty string explicitly: a plain truthiness test
         * also treats the text '0' as empty and would delete <p>0</p>. A null
         * result (regex failure) leaves the paragraph in place.
         */
        $visibleText = preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $paragraph->textContent);
        if ($visibleText !== '') {
            continue;
        }

        // mb_substr keeps multibyte characters intact in the log excerpt.
        $this->logger->debug(sprintf('[PrepArticle] Removing extra paragraph. Text content was: \'%s\'', mb_substr($paragraph->textContent, 0, 128)));
        $paragraph->parentNode->removeChild($paragraph);
    }
}
||
1440 | |||
/**
 * Removes the elements of the given tag that look like junk rather than content.
 *
 * Does nothing when conditional cleaning is disabled in the configuration.
 * Elements inside a table flagged as a data table are always kept. An element
 * is removed when its class weight is negative, or when it has fewer than 10
 * commas and at least one heuristic on images, list items, inputs, content
 * length, link density or embeds flags it.
 *
 * @param DOMDocument $article Article document, modified in place
 * @param string $tag Tag to clean conditionally
 *
 * @return void
 */
public function _cleanConditionally(DOMDocument $article, $tag)
{
    if (!$this->configuration->getCleanConditionally()) {
        return;
    }

    $isList = in_array($tag, ['ul', 'ol']);

    /*
     * Traverse backwards so nodes can be removed along the way
     * without affecting the positions still to be visited.
     */
    $matches = $article->getElementsByTagName($tag);
    for ($index = $matches->length - 1; $index >= 0; $index--) {
        /** @var $node DOMElement */
        $node = $matches->item($index);

        // First check if we're in a data table, in which case don't remove us.
        $insideDataTable = $node->hasAncestorTag('table', -1, function ($ancestor) {
            return $ancestor->isReadabilityDataTable();
        });
        if ($insideDataTable) {
            continue;
        }

        $weight = $this->configuration->getWeightClasses() ? $node->getClassWeight() : 0;

        if ($weight < 0) {
            $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\' with 0 or less weight', $tag));

            NodeUtility::removeNode($node);
            continue;
        }

        // Plenty of commas: treat the element as real content and keep it.
        if (substr_count($node->getTextContent(), ',') >= 10) {
            continue;
        }

        /*
         * Not very many commas: remove the element if the number of
         * non-paragraph elements is more than paragraphs, or on other
         * ominous signs.
         */
        $paragraphs = $node->getElementsByTagName('p')->length;
        $images = $node->getElementsByTagName('img')->length;
        // The 100 offset is applied to the <li> count before it is compared with the <p> count.
        $listItems = $node->getElementsByTagName('li')->length - 100;
        $inputs = $node->getElementsByTagName('input')->length;

        // Count only the embeds whose markup matches the 'videos' pattern.
        $embedCount = 0;
        foreach ($node->getElementsByTagName('embed') as $embedNode) {
            if (preg_match(NodeUtility::$regexps['videos'], $embedNode->C14N())) {
                $embedCount++;
            }
        }

        $linkDensity = $node->getLinkDensity();
        $contentLength = mb_strlen($node->getTextContent(true));

        // Evaluation order matters: `$images > 1` guards the division by $images.
        $haveToRemove =
            ($images > 1 && $paragraphs / $images < 0.5 && !$node->hasAncestorTag('figure')) ||
            (!$isList && $listItems > $paragraphs) ||
            ($inputs > floor($paragraphs / 3)) ||
            (!$isList && $contentLength < 25 && ($images === 0 || $images > 2) && !$node->hasAncestorTag('figure')) ||
            (!$isList && $weight < 25 && $linkDensity > 0.2) ||
            ($weight >= 25 && $linkDensity > 0.5) ||
            (($embedCount === 1 && $contentLength < 75) || $embedCount > 1);

        if ($haveToRemove) {
            $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\'.', $tag));

            NodeUtility::removeNode($node);
        }
    }
}
||
1527 | |||
/**
 * Removes every element of the given tag from the article.
 *
 * For object, embed and iframe tags, an element is kept when its attribute
 * values or its serialised markup match the 'videos' pattern.
 *
 * @param $article DOMDocument Article document, modified in place
 * @param $tag string tag to clean
 *
 * @return void
 **/
public function _clean(DOMDocument $article, $tag)
{
    $isEmbed = in_array($tag, ['object', 'embed', 'iframe']);

    // Walk the live list from its end so removals do not shift pending positions.
    $found = $article->getElementsByTagName($tag);
    for ($index = $found->length - 1; $index >= 0; $index--) {
        $item = $found->item($index);

        if ($isEmbed) {
            $videoPattern = NodeUtility::$regexps['videos'];

            // Gather all attribute values into a single pipe-separated string.
            $values = [];
            foreach ($item->attributes as $attribute) {
                $values[] = $attribute->nodeValue;
            }

            // Keep the element when either its attributes or, failing that,
            // its markup look like a video.
            if (preg_match($videoPattern, implode('|', $values)) || preg_match($videoPattern, $item->C14N())) {
                continue;
            }
        }

        $this->logger->debug(sprintf('[PrepArticle] Removing node \'%s\'.', $item->tagName));

        NodeUtility::removeNode($item);
    }
}
||
1569 | |||
/**
 * Remove h1 and h2 headers whose class weight is negative.
 *
 * Nothing is removed when class weighting is disabled in the configuration, because the
 * weight of every header is then treated as 0.
 *
 * @param DOMDocument $article
 *
 * @return void
 **/
public function _cleanHeaders(DOMDocument $article)
{
    // Without class weighting every header weighs 0, so none can fall below zero.
    if (!$this->configuration->getWeightClasses()) {
        return;
    }

    for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) {
        // Snapshot the live node list first: removing a node while iterating the list
        // directly makes foreach skip the header that follows the removed one.
        $headers = iterator_to_array($article->getElementsByTagName('h' . $headerIndex));

        /** @var $header DOMElement */
        foreach ($headers as $header) {
            if ($header->getClassWeight() < 0) {
                $this->logger->debug(sprintf('[PrepArticle] Removing H node with negative weight. Content was: \'%s\'', substr($header->nodeValue, 0, 128)));

                NodeUtility::removeNode($header);
            }
        }
    }
}
||
1596 | |||
/**
 * Strip the class="" attribute from the given node and, recursively, from the nodes
 * below it.
 *
 * Readability.js has a special filter to avoid cleaning the classes that the algorithm adds.
 * We don't add classes here so there is no need to filter those.
 *
 * @param DOMDocument|DOMNode $node
 *
 * @return void
 **/
public function _cleanClasses($node)
{
    $hasClass = $node->getAttribute('class') !== '';
    if ($hasClass) {
        $node->removeAttribute('class');
    }

    // Start at the first element child, then follow the sibling chain.
    // NOTE(review): nextSibling may yield non-element nodes; this presumably relies on the
    // project's node classes tolerating getAttribute() on them - confirm.
    $child = $node->getFirstElementChild();
    while ($child !== null) {
        $this->_cleanClasses($child);
        $child = $child->nextSibling;
    }
}
||
1618 | |||
/**
 * Final pass over the extracted article.
 *
 * When the configuration asks for relative URLs to be fixed, 'javascript:' links are replaced
 * by their text, other link hrefs and image sources are converted to absolute URIs. Class
 * attributes are always stripped afterwards.
 *
 * @param DOMDocument $article
 *
 * @return DOMDocument the same document instance, after processing
 */
public function postProcessContent(DOMDocument $article)
{
    $this->logger->info('[PostProcess] PostProcessing content...');

    // Readability cannot open relative uris so we convert them to absolute uris.
    if ($this->configuration->getFixRelativeURLs()) {
        // Work on a snapshot of the anchors because some of them get replaced below.
        $anchors = iterator_to_array($article->getElementsByTagName('a'));

        /** @var DOMElement $link */
        foreach ($anchors as $link) {
            $href = $link->getAttribute('href');
            if (!$href) {
                continue;
            }

            if (strpos($href, 'javascript:') !== 0) {
                $this->logger->debug(sprintf('[PostProcess] Converting link to absolute URI: \'%s\'', substr($href, 0, 128)));

                $link->setAttribute('href', $this->toAbsoluteURI($href));
                continue;
            }

            // Replace links with javascript: URIs with text content, since
            // they won't work after scripts have been removed from the page.
            $this->logger->debug(sprintf('[PostProcess] Removing \'javascript:\' link. Content is: \'%s\'', substr($link->textContent, 0, 128)));

            $replacement = $article->createTextNode($link->textContent);
            $link->parentNode->replaceChild($replacement, $link);
        }

        // Attributes that may carry the image URL, in order of preference.
        $sourceAttributes = ['src', 'data-src', 'data-original', 'data-orig', 'data-url'];

        /** @var DOMElement $img */
        foreach ($article->getElementsByTagName('img') as $img) {
            // Pick the first attribute holding a non-empty value.
            $src = false;
            foreach ($sourceAttributes as $attributeName) {
                $candidate = $img->getAttribute($attributeName);
                if ($candidate) {
                    $src = $candidate;
                    break;
                }
            }

            if (!$src) {
                continue;
            }

            $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($src, 0, 128)));

            $img->setAttribute('src', $this->toAbsoluteURI($src));
        }
    }

    $this->_cleanClasses($article);

    return $article;
}
||
1676 | |||
/**
 * Render the article as HTML: the title wrapped in an h1 element, followed by the content.
 *
 * @return string
 */
public function __toString()
{
    $title = $this->getTitle();
    $content = $this->getContent();

    return sprintf('<h1>%s</h1>%s', $title, $content);
}
||
1684 | |||
/**
 * Get the title of the article.
 *
 * @return string|null the stored title
 */
public function getTitle()
{
    return $this->title;
}
||
1692 | |||
/**
 * Store the title of the article.
 *
 * @param string $title
 */
protected function setTitle($title)
{
    $this->title = $title;
}
||
1700 | |||
/**
 * Get the parsed content serialized with C14N().
 *
 * @return string|null the serialized content, or null when no content document is stored
 */
public function getContent()
{
    if ($this->content instanceof DOMDocument) {
        return $this->content->C14N();
    }

    return null;
}
||
1708 | |||
/**
 * Get the stored content document itself rather than its serialized form.
 *
 * @return DOMDocument|null
 */
public function getDOMDocument()
{
    return $this->content;
}
||
1716 | |||
/**
 * Store the document holding the parsed content.
 *
 * @param DOMDocument $content
 */
protected function setContent(DOMDocument $content)
{
    $this->content = $content;
}
||
1724 | |||
/**
 * Get the excerpt of the article.
 *
 * @return string|null the stored excerpt
 */
public function getExcerpt()
{
    return $this->excerpt;
}
||
1732 | |||
/**
 * Store the excerpt of the article.
 *
 * @param string|null $excerpt
 */
public function setExcerpt($excerpt)
{
    $this->excerpt = $excerpt;
}
||
1740 | |||
/**
 * Get the main image of the article.
 *
 * @return string|null the stored image
 */
public function getImage()
{
    return $this->image;
}
||
1748 | |||
1749 | /** |
||
1750 | * @param string $image |
||
1751 | */ |
||
1752 | protected function setImage($image) |
||
1755 | } |
||
1756 | |||
/**
 * Get the author of the article.
 *
 * @return string|null the stored author
 */
public function getAuthor()
{
    return $this->author;
}
||
1764 | |||
/**
 * Store the author of the article.
 *
 * @param string $author
 */
protected function setAuthor($author)
{
    $this->author = $author;
}
||
1772 | |||
/**
 * Get the name of the website the article belongs to.
 *
 * @return string|null the stored site name
 */
public function getSiteName()
{
    return $this->siteName;
}
||
1780 | |||
/**
 * Store the name of the website the article belongs to.
 *
 * @param string $siteName
 */
protected function setSiteName($siteName)
{
    $this->siteName = $siteName;
}
||
1788 | |||
1789 | /** |
||
1790 | * @return null|string |
||
1791 | */ |
||
1792 | public function getDirection() |
||
1795 | } |
||
1796 | |||
/**
 * Store the direction of the text.
 *
 * @param string|null $direction
 */
public function setDirection($direction)
{
    $this->direction = $direction;
}
||
1804 | } |
||
1805 |