| Total Complexity | 295 |
| Total Lines | 1788 |
| Duplicated Lines | 0 % |
| Changes | 2 | ||
| Bugs | 0 | Features | 1 |
Complex classes like Readability often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Readability, and based on these observations, apply Extract Interface, too.
| 1 | <?php |
||
| 15 | class Readability |
||
| 16 | { |
||
/**
 * Main DOMDocument where all the magic happens.
 *
 * Assigned by parse() from loadHTML(), and replaced with a freshly loaded
 * document every time parse() retries with a relaxed configuration.
 *
 * @var DOMDocument
 */
protected $dom;

/**
 * Title of the article.
 *
 * @var string|null
 */
protected $title = null;

/**
 * Final DOMDocument with the fully parsed HTML.
 *
 * parse() hands its post-processed result to setContent(); presumably that
 * setter stores it here (the setter is not visible in this part of the file).
 *
 * @var DOMDocument|null
 */
protected $content = null;

/**
 * Excerpt of the article.
 *
 * Taken from the metadata description when present; otherwise parse() falls
 * back to the text of the first paragraph of the extracted content.
 *
 * @var string|null
 */
protected $excerpt = null;

/**
 * Main image of the article.
 *
 * @var string|null
 */
protected $image = null;

/**
 * Author of the article. Extracted from the byline tags and other social media properties.
 *
 * @var string|null
 */
protected $author = null;

/**
 * Website name.
 *
 * @var string|null
 */
protected $siteName = null;

/**
 * Direction of the text.
 *
 * @var string|null
 */
protected $direction = null;

/**
 * Configuration object.
 *
 * @var Configuration
 */
private $configuration;

/**
 * Logger object. Obtained from the configuration in the constructor.
 *
 * @var LoggerInterface
 */
private $logger;

/**
 * Collection of attempted text extractions.
 *
 * parse() appends one entry per extraction attempt that stayed below the
 * character threshold; each entry has the keys 'articleContent' (the
 * extraction result) and 'textLength' (the length parse() measured for it).
 *
 * @var array
 */
private $attempts = [];

/**
 * Tag names whose nodes getNodes() adds to the list of elements to score.
 *
 * @var array
 */
private $defaultTagsToScore = [
    'section',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'p',
    'td',
    'pre',
];

/**
 * List of tag names.
 *
 * NOTE(review): not referenced in the visible part of this file; judging by
 * the name these are presumably tags exempt from being renamed to DIV -
 * confirm against the code that uses it.
 *
 * @var array
 */
private $alterToDIVExceptions = [
    'div',
    'article',
    'section',
    'p',
];
||
| 118 | |||
/**
 * Builds a parser bound to the given configuration.
 *
 * The logger used throughout the class is the one exposed by the
 * configuration object.
 *
 * @param Configuration $configuration
 */
public function __construct(Configuration $configuration)
{
    $this->logger = $configuration->getLogger();
    $this->configuration = $configuration;
}
||
| 129 | |||
/**
 * Main parse function.
 *
 * Loads the HTML, extracts metadata and the main image, then repeatedly
 * collects and rates candidate nodes. Whenever the extracted text is shorter
 * than the configured character threshold, the HTML is reloaded and one
 * configuration flag is switched off (StripUnlikelyCandidates, then
 * WeightClasses, then CleanConditionally) before trying again. Once no flag
 * is left, the longest attempt is used.
 *
 * Side effects: the flags switched off while retrying stay switched off on
 * the configuration object after this method returns.
 *
 * @param string $html
 *
 * @throws ParseException When the document has no usable body, or when no attempt produced any text
 *
 * @return bool Always true; failures are reported through ParseException
 */
public function parse($html)
{
    $this->logger->info('*** Starting parse process...');

    // Attempts recorded by a previous parse() call on this instance must not
    // be mistaken for content of the current document.
    $this->attempts = [];

    $this->dom = $this->loadHTML($html);

    // Checking for minimum HTML to work with.
    if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) {
        $this->logger->emergency('No body tag present or body tag empty');

        throw new ParseException('Invalid or incomplete HTML.');
    }

    $this->getMetadata();

    $this->getMainImage();

    while (true) {
        $root = $root->firstChild;

        $elementsToScore = $this->getNodes($root);
        $this->logger->debug(sprintf('Elements to score: \'%s\'', count($elementsToScore)));

        $result = $this->rateNodes($elementsToScore);

        /*
         * Now that we've gone through the full algorithm, check to see if
         * we got any meaningful content. If we didn't, we may need to re-run
         * the extraction with different flags set. This gives us a higher likelihood of
         * finding the content, and the sieve approach gives us a higher likelihood of
         * finding the -right- content.
         */

        // Guard the property read: $result is only checked for truthiness
        // further down, so it may be falsy here. A falsy result counts as
        // zero characters.
        $length = $result
            ? mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent))
            : 0;

        // Note: despite the wording of the message, $length is a character count.
        $this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getCharThreshold()));

        if (!$result || $length >= $this->configuration->getCharThreshold()) {
            // NOTE(review): a falsy $result also ends up here and is passed on to
            // postProcessContent() unchanged - confirm that is intended.
            break;
        }

        // Threshold not met: remember this attempt and start over from a fresh
        // document, because the extraction above modified the current one.
        $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];
        $this->dom = $this->loadHTML($html);
        $root = $this->dom->getElementsByTagName('body')->item(0);

        if ($this->configuration->getStripUnlikelyCandidates()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
            $this->configuration->setStripUnlikelyCandidates(false);
        } elseif ($this->configuration->getWeightClasses()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting WeightClasses as false');
            $this->configuration->setWeightClasses(false);
        } elseif ($this->configuration->getCleanConditionally()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting CleanConditionally as false');
            $this->configuration->setCleanConditionally(false);
        } else {
            $this->logger->debug('[Parsing] Threshold not met, searching across attempts for some content.');

            // No luck after removing flags, just return the longest text we found during the different loops.
            // The comparator has to return an integer (negative, zero or positive);
            // a boolean does not give usort() a reliable order.
            usort($this->attempts, function ($a, $b) {
                return $b['textLength'] - $a['textLength'];
            });

            // But first check if we actually have something
            if (!$this->attempts[0]['textLength']) {
                $this->logger->emergency('[Parsing] Could not parse text, giving up :(');

                throw new ParseException('Could not parse text.');
            }

            $this->logger->debug('[Parsing] Threshold not met, but found some content in previous attempts.');

            $result = $this->attempts[0]['articleContent'];
            break;
        }
    }

    $result = $this->postProcessContent($result);

    // If we haven't found an excerpt in the article's metadata, use the article's
    // first paragraph as the excerpt. This can be used for displaying a preview of
    // the article's content.
    if (!$this->getExcerpt()) {
        $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
        $paragraphs = $result->getElementsByTagName('p');
        if ($paragraphs->length > 0) {
            $this->setExcerpt(trim($paragraphs->item(0)->textContent));
        }
    }

    $this->setContent($result);

    $this->logger->info('*** Parse successful :)');

    return true;
}
||
| 237 | |||
/**
 * Creates a DOM Document object and loads the provided HTML on it.
 *
 * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text)
 * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
 * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
 * objects and ruining the backup.
 *
 * Side effect: switches libxml to internal error handling for the rest of the
 * process. NOTE(review): the collected libxml errors are never cleared here -
 * consider libxml_clear_errors() if nothing else relies on reading them.
 *
 * @param string $html
 *
 * @return DOMDocument Document with script/noscript nodes removed and prepDocument() applied
 */
private function loadHTML($html)
{
    $this->logger->debug('[Loading] Loading HTML...');

    // To avoid throwing a gazillion of errors on malformed HTMLs
    libxml_use_internal_errors(true);

    $dom = new DOMDocument('1.0', 'utf-8');

    if (!$this->configuration->getSubstituteEntities()) {
        // Keep the original HTML entities
        $dom->substituteEntities = false;
    }

    if ($this->configuration->getNormalizeEntities()) {
        $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
        // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
        $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
    }

    if ($this->configuration->getSummonCthulhu()) {
        $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
        $withoutScripts = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/', '', $html);

        // preg_replace() returns null when PCRE fails (for instance when the
        // backtrack limit is hit on a very large document). Keep the original
        // HTML in that case instead of loading an empty document;
        // removeScripts() below still strips the script nodes from the DOM.
        if ($withoutScripts !== null) {
            $html = $withoutScripts;
        } else {
            $this->logger->warning('[Loading] Could not remove script tags via regex, keeping the original HTML.');
        }
    }

    // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
    $dom->loadHTML('<?xml encoding="UTF-8">'.$html);
    $dom->encoding = 'UTF-8';

    $this->removeScripts($dom);

    $this->prepDocument($dom);

    $this->logger->debug('[Loading] Loaded HTML successfully.');

    return $dom;
}
||
| 287 | |||
/**
 * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties.
 *
 * Every <meta> tag of the document is inspected: its "property" attribute is
 * checked first and, when that yields nothing, its "name" attribute. The
 * collected values are then used to set title, author, excerpt, image and
 * site name. Each of them is set to null when no matching key was collected,
 * except the title, which falls back to getArticleTitle().
 */
private function getMetadata()
{
    $this->logger->debug('[Metadata] Retrieving metadata...');

    // property is a space-separated list of values
    $propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image|site_name)(?!:)\s*/i';

    // name is a single value
    $namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)(?!:)\s*$/i';

    $values = [];

    // Find description tags.
    foreach ($this->dom->getElementsByTagName('meta') as $meta) {
        /* @var DOMNode $meta */
        $metaName = $meta->getAttribute('name');
        $metaProperty = $meta->getAttribute('property');
        $metaContent = $meta->getAttribute('content');

        $matches = [];
        if ($metaProperty && preg_match($propertyPattern, $metaProperty, $matches)) {
            // Walk the match array from its last entry to its first.
            // NOTE(review): this stores the content under the full match AND under
            // each capture group (e.g. "og:title", "title" and "og") - confirm
            // that the group-only keys are wanted.
            foreach (array_reverse($matches) as $matched) {
                // Convert to lowercase, and remove any whitespace
                // so we can match below.
                $values[preg_replace('/\s/', '', mb_strtolower($matched))] = trim($metaContent);
            }
        }

        // The name attribute is only considered when the property attribute matched nothing.
        if (!$matches && $metaName && $metaContent && preg_match($namePattern, $metaName)) {
            // Convert to lowercase, remove any whitespace, and convert dots
            // to colons so we can match below.
            $normalized = preg_replace(['/\s/', '/\./'], ['', ':'], mb_strtolower($metaName));
            $values[$normalized] = trim($metaContent);
        }
    }

    // Returns the value stored under the first candidate key that was collected, or null.
    $collectedKeys = array_keys($values);
    $firstValueOf = function (array $candidates) use ($values, $collectedKeys) {
        $key = current(array_intersect($candidates, $collectedKeys));

        return isset($values[$key]) ? $values[$key] : null;
    };

    // get title
    $title = $firstValueOf([
        'dc:title',
        'dcterm:title',
        'og:title',
        'weibo:article:title',
        'weibo:webpage:title',
        'title',
        'twitter:title',
    ]);
    $this->setTitle($title !== null ? trim($title) : null);

    if (!$this->getTitle()) {
        $this->setTitle($this->getArticleTitle());
    }

    // get author
    $this->setAuthor($firstValueOf([
        'dc:creator',
        'dcterm:creator',
        'author',
    ]));

    // get description
    $this->setExcerpt($firstValueOf([
        'dc:description',
        'dcterm:description',
        'og:description',
        'weibo:article:description',
        'weibo:webpage:description',
        'description',
        'twitter:description',
    ]));

    // get main image
    $this->setImage($firstValueOf([
        'image',
        'og:image',
        'twitter:image',
    ]));

    // get site name
    $this->setSiteName($firstValueOf([
        'og:site_name',
    ]));
}
||
| 395 | |||
/**
 * Returns all the images of the parsed article.
 *
 * The main image (when set) comes first, followed by the src attribute of
 * every <img> in the document returned by getDOMDocument(). When
 * FixRelativeURLs is enabled every entry is converted to an absolute URI.
 * Empty entries and duplicates are dropped; the keys of the surviving entries
 * are preserved, so the returned array may have gaps in its keys.
 *
 * When getDOMDocument() yields no document, only the main image (if any) is
 * returned, as is.
 *
 * @return array
 */
public function getImages()
{
    $result = [];
    if ($this->getImage()) {
        $result[] = $this->getImage();
    }

    if (null == $this->getDOMDocument()) {
        return $result;
    }

    foreach ($this->getDOMDocument()->getElementsByTagName('img') as $img) {
        if ($src = $img->getAttribute('src')) {
            $result[] = $src;
        }
    }

    if ($this->configuration->getFixRelativeURLs()) {
        // array_map() instead of a foreach by reference: no reference to the
        // last element is left dangling once the loop is over.
        $result = array_map(function ($imgSrc) {
            return $this->toAbsoluteURI($imgSrc);
        }, $result);
    }

    return array_unique(array_filter($result));
}
||
| 428 | |||
/**
 * Tries to get the main article image. Will only update the metadata if the getMetadata function couldn't
 * find a correct image.
 *
 * When no image is set yet, the first <link> whose rel attribute is exactly
 * "img_src" or "image_src" and which has an href attribute provides it. With
 * FixRelativeURLs enabled the image is stored as an absolute URI; otherwise an
 * image found through a <link> is stored as is (previously it was silently
 * discarded in that case).
 *
 * Despite its name this method returns nothing; it stores its result through setImage().
 */
public function getMainImage()
{
    $imgUrl = $this->getImage();

    if (!$imgUrl) {
        foreach ($this->dom->getElementsByTagName('link') as $link) {
            /** @var \DOMElement $link */
            /*
             * Check for the rel attribute, then check if the rel attribute is either img_src or image_src, and
             * finally check for the existence of the href attribute, which should hold the image url.
             */
            if ($link->hasAttribute('rel') && ($link->getAttribute('rel') === 'img_src' || $link->getAttribute('rel') === 'image_src') && $link->hasAttribute('href')) {
                $imgUrl = $link->getAttribute('href');
                break;
            }
        }
    }

    if (empty($imgUrl)) {
        return;
    }

    if ($this->configuration->getFixRelativeURLs()) {
        $this->setImage($this->toAbsoluteURI($imgUrl));
    } elseif ($imgUrl !== $this->getImage()) {
        // The URL came from a <link> tag: keep it even though it is not being made absolute.
        $this->setImage($imgUrl);
    }
}
||
| 459 | |||
/**
 * Returns the title of the html. Prioritizes the title from the metadata against the title tag.
 *
 * The raw title is then cleaned up:
 *  - when it contains a separator surrounded by spaces (|, -, \, /, > or »),
 *    the part after the last separator is dropped; if fewer than 3 words
 *    remain, the part before the first separator is dropped instead;
 *  - otherwise, when it contains ": " and no H1/H2 holds the exact title, the
 *    part after the last colon is used (or after the first colon if that is
 *    too short), unless more than 5 words precede the first colon;
 *  - otherwise, when it is longer than 150 or shorter than 15 characters and
 *    the document has exactly one H1, the H1 text is used.
 * If the cleaned title ends up with 4 words or fewer, the original title is
 * restored in the cases described further down.
 *
 * @return string|null Null when neither the metadata nor a <title> tag provides a title
 */
private function getArticleTitle()
{
    $originalTitle = null;

    if ($this->getTitle()) {
        $originalTitle = $this->getTitle();
    } else {
        $this->logger->debug('[Metadata] Could not find title in metadata, searching for the title tag...');
        $titleTag = $this->dom->getElementsByTagName('title');
        if ($titleTag->length > 0) {
            $this->logger->info(sprintf('[Metadata] Using title tag as article title: \'%s\'', $titleTag->item(0)->nodeValue));
            $originalTitle = $titleTag->item(0)->nodeValue;
        }
    }

    if ($originalTitle === null) {
        return null;
    }

    $curTitle = $originalTitle = trim($originalTitle);
    $titleHadHierarchicalSeparators = false;

    /*
     * If there's a separator in the title, first remove the final part.
     *
     * The "u" modifier is required on every pattern containing "»": without it
     * the character class is matched byte by byte, so a "»" between two spaces
     * never matches and replacements can cut the character in half.
     */
    if (preg_match('/ [\|\-\\\\\/>»] /iu', $curTitle)) {
        $titleHadHierarchicalSeparators = (bool) preg_match('/ [\\\\\/>»] /u', $curTitle);
        $curTitle = preg_replace('/(.*)[\|\-\\\\\/>»] .*/iu', '$1', $originalTitle);

        $this->logger->info(sprintf('[Metadata] Found hierarchical separators in title, new title is: \'%s\'', $curTitle));

        // If the resulting title is too short (fewer than 3 words), remove
        // the first part instead:
        if (count(preg_split('/\s+/', $curTitle)) < 3) {
            $curTitle = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/iu', '$1', $originalTitle);
            $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
        }
    } elseif (strpos($curTitle, ': ') !== false) {
        // Check if we have an heading containing this exact string, so we
        // could assume it's the full title.
        $match = false;
        for ($i = 1; $i <= 2 && !$match; $i++) {
            foreach ($this->dom->getElementsByTagName('h'.$i) as $hTag) {
                // Trim texts to avoid having false negatives when the title is surrounded by spaces or tabs
                if (trim($hTag->nodeValue) === trim($curTitle)) {
                    $match = true;
                    break;
                }
            }
        }

        // If we don't, let's extract the title out of the original title string.
        if (!$match) {
            $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);

            $this->logger->info(sprintf('[Metadata] Title has a colon in the middle, new title is: \'%s\'', $curTitle));

            // If the title is now too short, try the first colon instead:
            if (count(preg_split('/\s+/', $curTitle)) < 3) {
                $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
                $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
            } elseif (count(preg_split('/\s+/', substr($originalTitle, 0, strpos($originalTitle, ':')))) > 5) {
                // But if we have too many words before the colon there's something weird
                // with the titles and the H tags so let's just use the original title instead.
                // The words are counted on the original title: $curTitle is the text after
                // the last colon and therefore never contains one.
                $curTitle = $originalTitle;
            }
        }
    } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
        $hOnes = $this->dom->getElementsByTagName('h1');

        if ($hOnes->length === 1) {
            $curTitle = $hOnes->item(0)->nodeValue;
            $this->logger->info(sprintf('[Metadata] Using title from an H1 node: \'%s\'', $curTitle));
        }
    }

    $curTitle = trim($curTitle);

    /*
     * If we now have 4 words or fewer as our title, and either no
     * 'hierarchical' separators (\, /, > or ») were found in the original
     * title or we decreased the number of words by more than 1 word, use
     * the original title.
     */
    $curTitleWordCount = count(preg_split('/\s+/', $curTitle));
    $originalTitleWordCount = count(preg_split('/\s+/', preg_replace('/[\|\-\\\\\/>»]+/u', '', $originalTitle))) - 1;

    if ($curTitleWordCount <= 4 &&
        (!$titleHadHierarchicalSeparators || $curTitleWordCount !== $originalTitleWordCount)) {
        $curTitle = $originalTitle;

        $this->logger->info(sprintf('[Metadata] Cleaned title has too few words, using the original title: \'%s\'', $curTitle));
    }

    return $curTitle;
}
||
| 563 | |||
/**
 * Convert URI to an absolute URI.
 *
 * URIs that already carry a scheme and hash-only URIs ("#...") are returned
 * unchanged. Everything else is resolved against the path information that
 * getPathInfo() derives from the configured original URL.
 *
 * @param $uri string URI to convert
 *
 * @return string
 */
private function toAbsoluteURI($uri)
{
    // If this is already an absolute URI, return it. Hash URIs are ignored too.
    // Both checks come first so that the original URL is only parsed when the
    // URI really has to be resolved against it.
    if (preg_match('/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/', $uri) || substr($uri, 0, 1) === '#') {
        return $uri;
    }

    list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->configuration->getOriginalURL());

    // Scheme-rooted relative URI.
    if (substr($uri, 0, 2) === '//') {
        return $scheme.'://'.substr($uri, 2);
    }

    // Prepath-rooted relative URI.
    if (substr($uri, 0, 1) === '/') {
        return $prePath.$uri;
    }

    // Dotslash relative URI.
    if (strpos($uri, './') === 0) {
        return $pathBase.substr($uri, 2);
    }

    // Standard relative URI; add entire path. pathBase already includes a
    // trailing "/".
    return $pathBase.$uri;
}
||
| 603 | |||
/**
 * Returns full path info of an URL.
 *
 * - $pathBase: scheme, host (plus port, if any) and the directory of the URL
 *   path, with a trailing "/". When the document reports a base URI, a base
 *   starting with "/" replaces the directory and any other base is appended to it.
 * - $scheme: scheme of $pathBase.
 * - $prePath: scheme and host (plus port, if any) of $pathBase, without a trailing "/".
 *
 * The directory is everything up to and including the last "/" of the path,
 * so "/page.html" resolves to "/" and "/dir/" stays "/dir/".
 *
 * @param string $url
 *
 * @return array [$pathBase, $scheme, $prePath]
 */
public function getPathInfo($url)
{
    $port = parse_url($url, PHP_URL_PORT);
    $origin = parse_url($url, PHP_URL_SCHEME).'://'.parse_url($url, PHP_URL_HOST).($port ? ':'.$port : '');

    // Directory of the URL path. dirname() is avoided on purpose: it turns
    // "/page.html" into "/" (giving "//" once the separator is appended),
    // drops the last segment of "/dir/" and uses backslashes on Windows.
    $path = (string) parse_url($url, PHP_URL_PATH);
    $lastSlash = strrpos($path, '/');
    $directory = $lastSlash === false ? '/' : substr($path, 0, $lastSlash + 1);

    // Check for base URLs
    $baseURI = $this->dom->baseURI;
    if ($baseURI === null) {
        $pathBase = $origin.$directory;
    } elseif (substr($baseURI, 0, 1) === '/') {
        // URLs starting with '/' override completely the URL defined in the link
        $pathBase = $origin.$baseURI;
    } else {
        // Otherwise just prepend the base to the actual path
        $pathBase = $origin.$directory.rtrim($baseURI, '/').'/';
    }

    $scheme = parse_url($pathBase, PHP_URL_SCHEME);
    $basePort = parse_url($pathBase, PHP_URL_PORT);
    $prePath = $scheme.'://'.parse_url($pathBase, PHP_URL_HOST).($basePort ? ':'.$basePort : '');

    return [$pathBase, $scheme, $prePath];
}
||
| 631 | |||
/**
 * Gets nodes from the root element.
 *
 * Walks the tree starting at $node and, along the way:
 *  - removes comment nodes, nodes that are not probably visible, bylines,
 *    unlikely candidates (only while StripUnlikelyCandidates is enabled) and
 *    empty div/section/header/h1-h6/p nodes;
 *  - wraps runs of phrasing content found directly inside a div into new p nodes;
 *  - replaces a div holding a single p (and a link density below 0.25) with
 *    that p, or renames the div to p when hasSingleChildBlockElement() is false.
 *
 * The document is modified in place.
 *
 * @param $node DOMNode|DOMText
 *
 * @return array Nodes selected for scoring
 */
private function getNodes($node)
{
    $this->logger->info('[Get Nodes] Retrieving nodes...');

    $stripUnlikelyCandidates = $this->configuration->getStripUnlikelyCandidates();

    // Short excerpt of a node's text for the log messages. Cut by characters
    // rather than bytes so that a multibyte character is never split in half.
    $excerptOf = function ($loggedNode) {
        return mb_substr((string) $loggedNode->nodeValue, 0, 128, 'UTF-8');
    };

    // Tags that are removed when they hold no content (e.g. text, image, video, or iframe).
    $removedWhenEmpty = ['div', 'section', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'];

    $elementsToScore = [];

    /*
     * First, node prepping. Trash nodes that look cruddy (like ones with the
     * class name "comment", etc), and turn divs into P tags where they have been
     * used inappropriately (as in, where they contain no other block level elements.)
     */

    while ($node) {
        // Remove DOMComments nodes as we don't need them and mess up children counting
        if ($node->nodeType === XML_COMMENT_NODE) {
            $this->logger->debug(sprintf('[Get Nodes] Found comment node, removing... Node content was: \'%s\'', $excerptOf($node)));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        $matchString = $node->getAttribute('class').' '.$node->getAttribute('id');

        if (!$node->isProbablyVisible()) {
            $this->logger->debug(sprintf('[Get Nodes] Removing hidden node... Match string was: \'%s\'', $matchString));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        // Check to see if this node is a byline, and remove it if it is.
        if ($this->checkByline($node, $matchString)) {
            $this->logger->debug(sprintf('[Get Nodes] Found byline, removing... Node content was: \'%s\'', $excerptOf($node)));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        // Remove unlikely candidates
        if ($stripUnlikelyCandidates) {
            if (
                preg_match(NodeUtility::$regexps['unlikelyCandidates'], $matchString) &&
                !preg_match(NodeUtility::$regexps['okMaybeItsACandidate'], $matchString) &&
                $node->nodeName !== 'body' &&
                $node->nodeName !== 'a'
            ) {
                $this->logger->debug(sprintf('[Get Nodes] Removing unlikely candidate. Node content was: \'%s\'', $excerptOf($node)));
                $node = NodeUtility::removeAndGetNext($node);
                continue;
            }
        }

        // Remove DIV, SECTION, HEADER, H1-H6 and P nodes without any content.
        if (in_array($node->nodeName, $removedWhenEmpty, true) && $node->isElementWithoutContent()) {
            $this->logger->debug(sprintf('[Get Nodes] Removing empty \'%s\' node.', $node->nodeName));
            $node = NodeUtility::removeAndGetNext($node);
            continue;
        }

        if (in_array(strtolower($node->nodeName), $this->defaultTagsToScore, true)) {
            $this->logger->debug(sprintf('[Get Nodes] Adding node to score list, node content is: \'%s\'', $excerptOf($node)));
            $elementsToScore[] = $node;
        }

        // Turn all divs that don't have children block level elements into p's
        if ($node->nodeName === 'div') {
            // Put phrasing content into paragraphs.
            $p = null;
            $childNode = $node->firstChild;
            while ($childNode) {
                // Remember the sibling first: $childNode may be moved into a p below.
                $nextSibling = $childNode->nextSibling;
                if ($childNode->isPhrasingContent()) {
                    if ($p !== null) {
                        $p->appendChild($childNode);
                    } elseif (!$childNode->isWhitespace()) {
                        // First non-whitespace phrasing node of a run: open a new paragraph in its place.
                        $p = $this->dom->createElement('p');
                        $node->replaceChild($p, $childNode);
                        $p->appendChild($childNode);
                    }
                } elseif ($p !== null) {
                    // The run ended: drop trailing whitespace and close the paragraph.
                    while ($p->lastChild && $p->lastChild->isWhitespace()) {
                        $p->removeChild($p->lastChild);
                    }
                    $p = null;
                }
                $childNode = $nextSibling;
            }

            /*
             * Sites like http://mobile.slate.com encloses each paragraph with a DIV
             * element. DIVs with only a P element inside and no text content can be
             * safely converted into plain P elements to avoid confusing the scoring
             * algorithm with DIVs with are, in practice, paragraphs.
             */
            if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) {
                $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', $excerptOf($node)));
                $pNode = NodeUtility::filterTextNodes($node->childNodes)->item(0);
                $node->parentNode->replaceChild($pNode, $node);
                $node = $pNode;
                $elementsToScore[] = $node;
            } elseif (!$node->hasSingleChildBlockElement()) {
                // NOTE(review): the message below says "with a single child block element"
                // although this branch runs when hasSingleChildBlockElement() is false.
                $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single child block element, converting to a P node. Node content is: \'%s\'', $excerptOf($node)));
                $node = NodeUtility::setNodeTag($node, 'p');
                $elementsToScore[] = $node;
            }
        }

        $node = NodeUtility::getNextNode($node);
    }

    return $elementsToScore;
}
||
| 754 | |||
/**
 * Checks if the node is a byline.
 *
 * A node counts as a byline when its rel attribute is "author" or its
 * class/id string matches the byline pattern, AND its text passes
 * isValidByline(). When it does, the trimmed text is stored as the author.
 *
 * Nothing is checked when the ArticleByLine option is disabled or an author
 * is already set.
 *
 * @param DOMNode $node
 * @param string $matchString Class and id of the node, as built by the caller
 *
 * @return bool True when the node was recognised as the byline
 */
private function checkByline($node, $matchString)
{
    if (!$this->configuration->getArticleByLine()) {
        return false;
    }

    /*
     * Check if the byline is already set
     */
    if ($this->getAuthor()) {
        return false;
    }

    $rel = $node->getAttribute('rel');

    if ($rel !== 'author' && !preg_match(NodeUtility::$regexps['byline'], $matchString)) {
        return false;
    }

    // The text has to be validated for both kinds of candidates. Without the
    // explicit grouping, "&&" binds tighter than "||" and rel="author" nodes
    // would be accepted (and removed by the caller) even when they are empty
    // or hold a large chunk of text.
    $text = $node->getTextContent();
    if (!$this->isValidByline($text)) {
        return false;
    }

    $this->logger->info(sprintf('[Metadata] Found article author: \'%s\'', $text));
    $this->setAuthor(trim($text));

    return true;
}
||
| 787 | |||
/**
 * Tells whether a value is usable as a byline, judging only by its length.
 *
 * The value must be a string whose trimmed length, counted in characters
 * with mb_strlen(), is between 1 and 99 inclusive.
 *
 * @param string $text Candidate byline text
 *
 * @return bool True for a non-empty string shorter than 100 characters after trimming
 */
private function isValidByline($text)
{
    // Anything that is not a string can never be a byline.
    if (!is_string($text)) {
        return false;
    }

    $length = mb_strlen(trim($text));

    return $length > 0 && $length < 100;
}
||
| 805 | |||
/**
 * Strips every <script> and <noscript> element from the document.
 *
 * All <script> elements are removed first, then all <noscript> elements.
 *
 * @param DOMDocument $dom Document to clean in place
 */
private function removeScripts(DOMDocument $dom)
{
    foreach (['script', 'noscript'] as $tagName) {
        // Work on a copy of the result list so removals cannot disturb the iteration.
        $pending = iterator_to_array($dom->getElementsByTagName($tagName));

        while ($pending) {
            NodeUtility::removeNode(array_shift($pending));
        }
    }
}
||
| 820 | |||
/**
 * Normalises the document before nodes are collected and scored.
 *
 * Two rewrites are applied in place:
 *  - a chain of two or more <br> elements is collapsed into a <p> that adopts
 *    the phrasing content following the chain;
 *  - every <font> element is renamed to <span>.
 *
 * @param DOMDocument $dom Document to prepare
 */
private function prepDocument(DOMDocument $dom)
{
    $this->logger->info('[PrepDocument] Preparing document for parsing...');

    foreach ($dom->shiftingAwareGetElementsByTagName('br') as $br) {
        $cursor = $br->nextSibling;

        // Set once at least one follow-up <br> has been dropped, i.e. $br started a chain.
        $chainFound = false;

        /*
         * Drop every further <br> of the chain, stopping at the first node returned by
         * NodeUtility::nextElement() that is not a <br>. The first <br> of the chain
         * ($br itself) stays in place and is swapped for a <p> below.
         */
        while (true) {
            $cursor = NodeUtility::nextElement($cursor);
            if (!$cursor || $cursor->nodeName !== 'br') {
                break;
            }

            $this->logger->debug('[PrepDocument] Removing chain of BR nodes...');

            $chainFound = true;
            $afterRemoved = $cursor->nextSibling;
            $cursor->parentNode->removeChild($cursor);
            $cursor = $afterRemoved;
        }

        if (!$chainFound) {
            continue;
        }

        // Swap the surviving <br> for a <p> and move the following siblings into it.
        $paragraph = $dom->createElement('p');
        $br->parentNode->replaceChild($paragraph, $br);

        $cursor = $paragraph->nextSibling;
        while ($cursor) {
            // Another <br><br> pair marks the start of the next paragraph: stop adopting.
            $startsNextChain = $cursor->nodeName === 'br'
                && ($peek = NodeUtility::nextElement($cursor->nextSibling))
                && $peek->nodeName === 'br';

            // Non-phrasing content cannot live inside the new <p> either.
            if ($startsNextChain || !$cursor->isPhrasingContent()) {
                break;
            }

            $this->logger->debug('[PrepDocument] Replacing BR with a P node...');

            // Remember the next sibling before the move detaches $cursor from this parent.
            $following = $cursor->nextSibling;
            $paragraph->appendChild($cursor);
            $cursor = $following;
        }

        // Trailing whitespace nodes add nothing to the paragraph.
        while ($paragraph->lastChild && $paragraph->lastChild->isWhitespace()) {
            $paragraph->removeChild($paragraph->lastChild);
        }

        // A <p> must not be nested in a <p>; turn the outer one into a <div>.
        if ($paragraph->parentNode->tagName === 'p') {
            NodeUtility::setNodeTag($paragraph->parentNode, 'div');
        }
    }

    // Replace font tags with span, walking the result list from its end to its start.
    $fonts = $dom->getElementsByTagName('font');
    for ($index = $fonts->length - 1; $index >= 0; $index--) {
        $this->logger->debug('[PrepDocument] Converting font tag into a span tag.');
        NodeUtility::setNodeTag($fonts->item($index), 'span');
    }
}
||
| 904 | |||
/**
 * Scores the given nodes, picks the best content container and builds the
 * article document from that container and its related siblings.
 *
 * Steps, in order:
 *  1. Each node with at least 25 characters of text gets a content score, which
 *     is added to its ancestors using a level-dependent divider. An ancestor
 *     seen for the first time is initialised and becomes a candidate.
 *  2. Candidate scores are scaled by (1 - link density) and the best
 *     getMaxTopCandidates() candidates are kept, ordered by descending score.
 *  3. The best candidate may be replaced by one of its ancestors (shared by
 *     other strong candidates, scoring higher, or wrapping it as an only child).
 *     When there is no candidate, or it is <body>, a copy of the body children
 *     wrapped in a new <div> is used instead.
 *  4. The final candidate and those of its siblings that score well enough, or
 *     look like prose paragraphs, are copied into a new document.
 *  5. That document is passed through prepArticle().
 *
 * Also sets the article direction from the first `dir` attribute found on the
 * candidate, its parent or the parent's ancestors.
 *
 * @param array $nodes Nodes to score
 *
 * @return DOMDocument|bool The prepared article, or false when no node was appended
 */
private function rateNodes($nodes)
{
    $this->logger->info('[Rating] Rating nodes...');

    $candidates = [];

    /** @var DOMElement $node */
    foreach ($nodes as $node) {
        if (is_null($node->parentNode)) {
            continue;
        }

        // Fetched once; the same text drives the length filter and the scoring below.
        $innerText = $node->getTextContent(true);
        $innerTextLength = mb_strlen($innerText);

        // Discard nodes with less than 25 characters.
        if ($innerTextLength < 25) {
            continue;
        }

        $ancestors = $node->getNodeAncestors();

        // Exclude nodes with no ancestor
        if (count($ancestors) === 0) {
            continue;
        }

        // Start with a point for the paragraph itself as a base.
        $contentScore = 1;

        // One point per comma-separated segment, i.e. number of commas + 1.
        $contentScore += count(explode(',', $innerText));

        // For every 100 characters in this paragraph, add another point. Up to 3 points.
        $contentScore += min(floor($innerTextLength / 100), 3);

        $this->logger->debug(sprintf('[Rating] Node score %s, content: \'%s\'', $contentScore, mb_substr($node->nodeValue, 0, 128)));

        /** @var $ancestor DOMElement */
        foreach ($ancestors as $level => $ancestor) {
            $this->logger->debug('[Rating] Found ancestor, initializing and adding it as a candidate...');
            if (!$ancestor->isInitialized()) {
                $ancestor->initializeNode($this->configuration->getWeightClasses());
                $candidates[] = $ancestor;
            }

            /*
             * Node score divider:
             * - parent:             1 (no division)
             * - grandparent:        2
             * - great grandparent+: ancestor level * 3
             */
            if ($level === 0) {
                $scoreDivider = 1;
            } elseif ($level === 1) {
                $scoreDivider = 2;
            } else {
                $scoreDivider = $level * 3;
            }

            $ancestor->contentScore = $ancestor->contentScore + ($contentScore / $scoreDivider);

            $this->logger->debug(sprintf('[Rating] Ancestor score %s, value: \'%s\'', $ancestor->contentScore, mb_substr($ancestor->nodeValue, 0, 128)));
        }
    }

    /*
     * After we've calculated scores, loop through all of the possible
     * candidate nodes we found and keep the ones with the highest score.
     */
    $maxTopCandidates = $this->configuration->getMaxTopCandidates();
    $topCandidates = [];
    foreach ($candidates as $candidate) {
        /*
         * Scale the final candidates score based on link density. Good content
         * should have a relatively small link density (5% or less) and be mostly
         * unaffected by this operation.
         */
        $candidate->contentScore = $candidate->contentScore * (1 - $candidate->getLinkDensity());

        // Insert at the first slot that is free or holds a lower score; keep the list capped.
        for ($i = 0; $i < $maxTopCandidates; $i++) {
            $aTopCandidate = isset($topCandidates[$i]) ? $topCandidates[$i] : null;

            if (!$aTopCandidate || $candidate->contentScore > $aTopCandidate->contentScore) {
                array_splice($topCandidates, $i, 0, [$candidate]);
                if (count($topCandidates) > $maxTopCandidates) {
                    array_pop($topCandidates);
                }
                break;
            }
        }
    }

    $topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null;

    if ($topCandidate === null || $topCandidate->nodeName === 'body') {
        /*
         * If we still have no top candidate, just use the body as a last resort.
         * The body children are copied into a fresh document so they can be modified.
         */
        $this->logger->info('[Rating] No top candidate found or top candidate is the body tag. Moving all child nodes to a new DIV node.');

        $fallbackDocument = new DOMDocument('1.0', 'utf-8');
        $fallbackDocument->encoding = 'UTF-8';
        $fallbackDocument->appendChild($fallbackDocument->createElement('div', ''));

        // A document without <body> simply yields an empty wrapper.
        $body = $this->dom->getElementsByTagName('body')->item(0);
        $kids = $body ? $body->childNodes : null;

        // Index loop kept from the original, which notes that foreach does not work here.
        for ($i = 0; $kids && $i < $kids->length; $i++) {
            $import = $fallbackDocument->importNode($kids->item($i), true);
            $fallbackDocument->firstChild->appendChild($import);
        }

        // The candidate has to be the wrapping element, not the document itself.
        $topCandidate = $fallbackDocument->firstChild;
    } else {
        $this->logger->info(sprintf('[Rating] Found top candidate, score: %s', $topCandidate->contentScore));

        /*
         * Find a better top candidate node if it contains (at least three) nodes which
         * belong to the `topCandidates` array and whose scores are quite close to the
         * current `topCandidate` node.
         */
        $alternativeCandidateAncestors = [];
        // A top candidate may have a zero score; max() keeps the division below defined.
        $referenceScore = max($topCandidate->contentScore, 0.1);
        $topCandidateCount = count($topCandidates);
        for ($i = 1; $i < $topCandidateCount; $i++) {
            if ($topCandidates[$i]->contentScore / $referenceScore >= 0.75) {
                $alternativeCandidateAncestors[] = $topCandidates[$i]->getNodeAncestors(false);
            }
        }

        $MINIMUM_TOPCANDIDATES = 3;
        $alternativeCount = count($alternativeCandidateAncestors);
        if ($alternativeCount >= $MINIMUM_TOPCANDIDATES) {
            $parentOfTopCandidate = $topCandidate->parentNode;

            // Only walk real elements below <body>; stop at the document node or a detached root.
            while ($parentOfTopCandidate && $parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) {
                $listsContainingThisAncestor = 0;
                for ($ancestorIndex = 0; $ancestorIndex < $alternativeCount && $listsContainingThisAncestor < $MINIMUM_TOPCANDIDATES; $ancestorIndex++) {
                    // Strict search: the question is whether this very node is in the list.
                    // A loose search compares objects by value and can match a different node.
                    $listsContainingThisAncestor += (int) in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true);
                }
                if ($listsContainingThisAncestor >= $MINIMUM_TOPCANDIDATES) {
                    $topCandidate = $parentOfTopCandidate;
                    break;
                }
                $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
            }
        }

        /*
         * Because of our bonus system, parents of candidates might have scores
         * themselves. They get half of the node. There won't be nodes with higher
         * scores than our topCandidate, but if we see the score going *up* in the first
         * few steps up the tree, that's a decent sign that there might be more content
         * lurking in other places that we want to unify in. The sibling stuff
         * below does some of that - but only if we've looked high enough up the DOM
         * tree.
         */
        $parentOfTopCandidate = $topCandidate->parentNode;
        $lastScore = $topCandidate->contentScore;

        // The scores shouldn't get too low.
        $scoreThreshold = $lastScore / 3;

        /* @var DOMElement $parentOfTopCandidate */
        // The null guard stops the walk at the top of the tree; without it a zero
        // threshold could keep this loop spinning on a null parent.
        while ($parentOfTopCandidate && $parentOfTopCandidate->nodeName !== 'body') {
            $parentScore = $parentOfTopCandidate->contentScore;
            if ($parentScore < $scoreThreshold) {
                break;
            }

            if ($parentScore > $lastScore) {
                // Alright! We found a better parent to use.
                $topCandidate = $parentOfTopCandidate;
                $this->logger->info('[Rating] Found a better top candidate.');
                break;
            }
            $lastScore = $parentOfTopCandidate->contentScore;
            $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
        }

        // If the top candidate is the only child, use parent instead. This will help sibling
        // joining logic when adjacent content is actually located in parent's sibling node.
        $parentOfTopCandidate = $topCandidate->parentNode;
        while ($parentOfTopCandidate && $parentOfTopCandidate->nodeName !== 'body' && count(NodeUtility::filterTextNodes($parentOfTopCandidate->childNodes)) === 1) {
            $topCandidate = $parentOfTopCandidate;
            $parentOfTopCandidate = $topCandidate->parentNode;
        }
    }

    /*
     * Now that we have the top candidate, look through its siblings for content
     * that might also be related. Things like preambles, content split by ads
     * that we removed, etc.
     */
    $this->logger->info('[Rating] Creating final article content document...');

    $articleContent = new DOMDocument('1.0', 'utf-8');

    $siblingScoreThreshold = max(10, $topCandidate->contentScore * 0.2);
    // Keep potential top candidate's parent node to try to get text direction of it later.
    $parentOfTopCandidate = $topCandidate->parentNode;
    $siblings = $parentOfTopCandidate->childNodes;

    $hasContent = false;

    $this->logger->info('[Rating] Adding top candidate siblings...');

    /* @var DOMElement $sibling */
    // Can't foreach here because down there we might change the tag name and that causes the foreach to skip items
    for ($i = 0; $i < $siblings->length; $i++) {
        $sibling = $siblings[$i];
        $append = false;

        if ($sibling === $topCandidate) {
            $this->logger->debug('[Rating] Sibling is equal to the top candidate, adding to the final article...');

            $append = true;
        } else {
            $contentBonus = 0;

            // Give a bonus if the sibling and the top candidate have the exact same, non-empty class name.
            if ($sibling->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') {
                $contentBonus += $topCandidate->contentScore * 0.2;
            }

            if ($sibling->contentScore + $contentBonus >= $siblingScoreThreshold) {
                $append = true;
            } elseif ($sibling->nodeName === 'p') {
                $linkDensity = $sibling->getLinkDensity();
                $nodeContent = $sibling->getTextContent(true);
                $nodeLength = mb_strlen($nodeContent);

                if ($nodeLength > 80 && $linkDensity < 0.25) {
                    // Long paragraph with few links.
                    $append = true;
                } elseif ($nodeContent && $nodeLength < 80 && (float) $linkDensity === 0.0 && preg_match('/\.( |$)/', $nodeContent)) {
                    // Short, link-free paragraph that contains a full stop. The density is
                    // compared as a float so both int 0 and float 0.0 count as "no links".
                    $append = true;
                }
            }
        }

        if ($append) {
            $this->logger->debug(sprintf('[Rating] Appending sibling to final article, content is: \'%s\'', mb_substr($sibling->nodeValue, 0, 128)));

            $hasContent = true;

            if (!in_array(strtolower($sibling->nodeName), $this->alterToDIVExceptions)) {
                /*
                 * We have a node that isn't a common block level element, like a form or td tag.
                 * Turn it into a div so it doesn't get filtered out later by accident.
                 */
                $sibling = NodeUtility::setNodeTag($sibling, 'div');
            }

            // importNode() hands back a copy owned by the article document, so the
            // sibling stays in the source tree and the list being iterated does not shift.
            $import = $articleContent->importNode($sibling, true);
            $articleContent->appendChild($import);
        }
    }

    $articleContent = $this->prepArticle($articleContent);

    if (!$hasContent) {
        return false;
    }

    // Find out text direction from ancestors of final top candidate.
    $ancestors = array_merge([$parentOfTopCandidate, $topCandidate], $parentOfTopCandidate->getNodeAncestors());
    foreach ($ancestors as $ancestor) {
        $articleDir = $ancestor->getAttribute('dir');
        if ($articleDir) {
            $this->setDirection($articleDir);
            $this->logger->debug(sprintf('[Rating] Found article direction: %s', $articleDir));
            break;
        }
    }

    return $articleContent;
}
||
| 1199 | |||
/**
 * Runs the clean-up passes over the assembled article and returns it.
 *
 * The passes run in a fixed order: presentational attributes and <style>,
 * data-table marking, removal of junk elements, "share" widgets, a duplicated
 * title in a lone <h2>, form controls, headers, conditional cleaning of
 * tables/lists/divs, empty paragraphs, <br> directly before <p>, and finally
 * single-cell tables, which are unwrapped.
 *
 * @param DOMDocument $article Article document, modified in place
 *
 * @return DOMDocument The same document after cleaning
 */
public function prepArticle(DOMDocument $article)
{
    $this->logger->info('[PrepArticle] Preparing final article...');

    $this->_cleanStyles($article);
    $this->_clean($article, 'style');

    // Mark data tables before anything is removed: their cells are often isolated
    // even though they are visually linked to other content-ful elements.
    $this->_markDataTables($article);

    // Clean out junk from the article content
    foreach (['form', 'fieldset'] as $conditionalTag) {
        $this->_cleanConditionally($article, $conditionalTag);
    }
    foreach (['object', 'embed', 'h1', 'footer', 'link', 'aside'] as $junkTag) {
        $this->_clean($article, $junkTag);
    }

    // Remove elements with "share" in their id/class, but only below the top-level
    // nodes: the final top candidates themselves are kept even if they match.
    foreach ($article->childNodes as $topLevelNode) {
        $this->_cleanMatchedNodes($topLevelNode, '/share/i');
    }

    /*
     * A lone h2 whose text substantially equals the article title is most likely
     * the header rather than a subheader; the title is extracted separately, so
     * the h2 is dropped.
     */
    $headings = $article->getElementsByTagName('h2');
    if ($headings->length === 1) {
        $headingText = $headings->item(0)->textContent;
        $title = $this->getTitle();

        // Relative length difference between heading and title; max() avoids dividing by zero.
        $lengthSimilarRate = (mb_strlen($headingText) - mb_strlen($title)) / max(mb_strlen($title), 1);

        if (abs($lengthSimilarRate) < 0.5) {
            // Look for the shorter text inside the longer one.
            $titlesMatch = $lengthSimilarRate > 0
                ? strpos($headingText, $title) !== false
                : strpos($title, $headingText) !== false;

            if ($titlesMatch) {
                $this->logger->info('[PrepArticle] Found title repeated in an H2 node, removing...');
                $this->_clean($article, 'h2');
            }
        }
    }

    foreach (['iframe', 'input', 'textarea', 'select', 'button'] as $controlTag) {
        $this->_clean($article, $controlTag);
    }
    $this->_cleanHeaders($article);

    // Do these last as the previous stuff may have removed junk that will affect these
    foreach (['table', 'ul', 'div'] as $containerTag) {
        $this->_cleanConditionally($article, $containerTag);
    }

    $this->_cleanExtraParagraphs($article);

    // A <br> whose next sibling is a <p> is redundant.
    $lineBreaks = iterator_to_array($article->getElementsByTagName('br'));
    foreach ($lineBreaks as $lineBreak) {
        $following = $lineBreak->nextSibling;
        if (!$following || $following->nodeName !== 'p') {
            continue;
        }

        $this->logger->debug('[PrepArticle] Removing br node next to a p node.');
        $lineBreak->parentNode->removeChild($lineBreak);
    }

    // Remove single-cell tables
    foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) {
        /** @var DOMNode $table */
        $rowContainer = $table->hasSingleTagInsideElement('tbody') ? $table->getFirstElementChild() : $table;
        if (!$rowContainer->hasSingleTagInsideElement('tr')) {
            continue;
        }

        $row = $rowContainer->getFirstElementChild();
        if (!$row->hasSingleTagInsideElement('td')) {
            continue;
        }

        $cell = $row->getFirstElementChild();

        // The cell becomes a <p> when all of its children are phrasing content, a <div> otherwise.
        $onlyPhrasingContent = true;
        foreach (iterator_to_array($cell->childNodes) as $cellChild) {
            $onlyPhrasingContent = $cellChild->isPhrasingContent() && $onlyPhrasingContent;
        }

        $cell = NodeUtility::setNodeTag($cell, $onlyPhrasingContent ? 'p' : 'div');
        $table->parentNode->replaceChild($cell, $table);
    }

    return $article;
}
||
| 1298 | |||
/**
 * Flags every table of the article as a 'data' table or a 'layout' table,
 * using checks similar to
 * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920.
 *
 * The verdict is stored on each table through setReadabilityDataTable().
 * The first rule that applies decides:
 *  - role="presentation" or datatable="0"                   -> layout
 *  - non-empty summary attribute, or a caption with content -> data
 *  - a col, colgroup, tfoot, thead or th descendant         -> data
 *  - a nested table                                         -> layout
 *  - 10 or more rows, or more than 4 columns                -> data
 *  - otherwise: data only when rows * columns exceeds 10
 *
 * @param DOMDocument $article Article whose tables are marked
 *
 * @return void
 */
public function _markDataTables(DOMDocument $article)
{
    $isDataTable = function ($table) {
        /** @var DOMElement $table */
        if ($table->getAttribute('role') === 'presentation') {
            return false;
        }

        if ($table->getAttribute('datatable') == '0') {
            return false;
        }

        if ($table->getAttribute('summary')) {
            return true;
        }

        $captions = $table->getElementsByTagName('caption');
        if ($captions->length > 0 && $captions->item(0)->childNodes->length > 0) {
            return true;
        }

        // If the table has a descendant with any of these tags, consider a data table:
        foreach (['col', 'colgroup', 'tfoot', 'thead', 'th'] as $structuralTag) {
            if ($table->getElementsByTagName($structuralTag)->length > 0) {
                return true;
            }
        }

        // Nested tables indicate a layout table:
        if ($table->getElementsByTagName('table')->length > 0) {
            return false;
        }

        $size = $table->getRowAndColumnCount();
        if ($size['rows'] >= 10 || $size['columns'] > 4) {
            return true;
        }

        // Now just go by size entirely:
        return $size['rows'] * $size['columns'] > 10;
    };

    foreach ($article->getElementsByTagName('table') as $table) {
        $table->setReadabilityDataTable($isDataTable($table));
    }
}
||
| 1358 | |||
/**
 * Removes `style` and deprecated presentational attributes from a node and,
 * recursively, from everything below it.
 *
 * An <svg> element is left untouched together with its whole subtree.
 * `width` and `height` are additionally removed from table, th, td, hr and
 * pre elements.
 *
 * @param $node DOMDocument|DOMNode Root of the subtree to clean
 **/
public function _cleanStyles($node)
{
    // Not every node type exposes a tag name (documents and text nodes do not).
    $hasTagName = property_exists($node, 'tagName');

    if ($hasTagName && $node->tagName === 'svg') {
        return;
    }

    // Do not bother if there's no method to remove an attribute
    if (method_exists($node, 'removeAttribute')) {
        // `style` and deprecated presentational attributes
        $attributesToDrop = ['align', 'background', 'bgcolor', 'border', 'cellpadding', 'cellspacing', 'frame', 'hspace', 'rules', 'style', 'valign', 'vspace'];

        // Elements on which the size attributes are deprecated as well.
        if ($hasTagName && in_array($node->tagName, ['table', 'th', 'td', 'hr', 'pre'])) {
            $attributesToDrop[] = 'width';
            $attributesToDrop[] = 'height';
        }

        foreach ($attributesToDrop as $attributeName) {
            $node->removeAttribute($attributeName);
        }
    }

    for ($child = $node->firstChild; $child !== null; $child = $child->nextSibling) {
        $this->_cleanStyles($child);
    }
}
||
| 1391 | |||
/**
 * Removes the nodes following $node whose "class id" string matches a pattern.
 *
 * The walk starts at NodeUtility::getNextNode($node) and stops at the node
 * returned by NodeUtility::getNextNode($node, true), so $node itself is never
 * removed. NOTE(review): the second argument presumably skips $node's own
 * children, which would limit the walk to $node's subtree - confirm in
 * NodeUtility.
 *
 * @param $node DOMElement Node to clean
 * @param $regex string Pattern tested against "<class> <id>" of each visited node
 *
 * @return void
 **/
public function _cleanMatchedNodes($node, $regex)
{
    $stopAt = NodeUtility::getNextNode($node, true);
    $current = NodeUtility::getNextNode($node);

    while ($current && $current !== $stopAt) {
        $className = $current->getAttribute('class');
        $id = $current->getAttribute('id');

        if (!preg_match($regex, sprintf('%s %s', $className, $id))) {
            $current = NodeUtility::getNextNode($current);
            continue;
        }

        $this->logger->debug(sprintf('Removing matched node with regex: \'%s\', node class was: \'%s\', id: \'%s\'', $regex, $className, $id));

        // Removing also yields the node to continue from.
        $current = NodeUtility::removeAndGetNext($current);
    }
}
||
| 1413 | |||
/**
 * Removes paragraphs that carry neither media nor text.
 *
 * A <p> is removed when it contains no img, embed, object or iframe element
 * and its text is empty once everything matched by the NodeUtility
 * 'onlyWhitespace' pattern has been stripped.
 *
 * @param DOMDocument $article Article document, modified in place
 *
 * @return void
 */
public function _cleanExtraParagraphs(DOMDocument $article)
{
    $paragraphs = $article->getElementsByTagName('p');

    // Walk from the last paragraph to the first so a removal never shifts
    // the positions that are still to be visited.
    for ($index = $paragraphs->length - 1; $index >= 0; $index--) {
        $paragraph = $paragraphs->item($index);

        // At this point, nasty iframes have been removed; only embedded video ones remain.
        $mediaCount = 0;
        foreach (['img', 'embed', 'object', 'iframe'] as $mediaTag) {
            $mediaCount += $paragraph->getElementsByTagName($mediaTag)->length;
        }

        if ($mediaCount > 0) {
            continue;
        }

        /*
         * Compare against the empty string explicitly. A truthiness test would
         * treat a paragraph whose whole text is "0" as empty, and would also
         * delete the paragraph when preg_replace() fails and returns null.
         */
        $remainingText = preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $paragraph->textContent);
        if ($remainingText !== '') {
            continue;
        }

        $this->logger->debug(sprintf('[PrepArticle] Removing extra paragraph. Text content was: \'%s\'', mb_substr($paragraph->textContent, 0, 128)));
        $paragraph->parentNode->removeChild($paragraph);
    }
}
||
| 1440 | |||
/**
 * Removes elements of the given tag that look like clutter rather than content.
 *
 * Does nothing when conditional cleaning is disabled in the configuration.
 * Elements inside a table marked as a data table are always kept. An element
 * with a negative class weight is removed outright. Otherwise, an element
 * with fewer than 10 commas in its text is removed when any of the "ominous"
 * checks in the body applies (image/paragraph ratio, list items, inputs,
 * short content, link density, embeds).
 *
 * @param DOMDocument $article Article document, modified in place
 * @param string $tag Tag to clean conditionally
 *
 * @return void
 */
public function _cleanConditionally(DOMDocument $article, $tag)
{
    if (!$this->configuration->getCleanConditionally()) {
        return;
    }

    $isList = in_array($tag, ['ul', 'ol']);

    /*
     * Traverse backwards so nodes can be removed on the way
     * without affecting the rest of the traversal.
     */
    $elements = $article->getElementsByTagName($tag);
    for ($index = $elements->length - 1; $index >= 0; $index--) {
        /** @var $node DOMElement */
        $node = $elements->item($index);

        // First check if we're in a data table, in which case don't remove us.
        $insideDataTable = $node->hasAncestorTag('table', -1, function ($ancestor) {
            return $ancestor->isReadabilityDataTable();
        });
        if ($insideDataTable) {
            continue;
        }

        // The class weight only counts when class weighting is enabled.
        $weight = $this->configuration->getWeightClasses() ? $node->getClassWeight() : 0;

        if ($weight < 0) {
            $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\' with 0 or less weight', $tag));

            NodeUtility::removeNode($node);
            continue;
        }

        // Plenty of commas: treated as real content and kept.
        if (substr_count($node->getTextContent(), ',') >= 10) {
            continue;
        }

        /*
         * Not very many commas: gather counts for typical embedded elements and
         * remove the element when non-paragraph content dominates or other
         * ominous signs show up.
         */
        $p = $node->getElementsByTagName('p')->length;
        $img = $node->getElementsByTagName('img')->length;
        // The list item count is offset by -100 before it is compared with $p.
        $li = $node->getElementsByTagName('li')->length - 100;
        $input = $node->getElementsByTagName('input')->length;

        // Only embeds whose serialised markup matches the video pattern are counted.
        $embedCount = 0;
        foreach ($node->getElementsByTagName('embed') as $embed) {
            if (preg_match(NodeUtility::$regexps['videos'], $embed->C14N())) {
                $embedCount++;
            }
        }

        $linkDensity = $node->getLinkDensity();
        $contentLength = mb_strlen($node->getTextContent(true));

        // $img > 1 is tested first, so the division is never by zero.
        $imagesOutweighParagraphs = $img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag('figure');
        $listItemsOutweighParagraphs = !$isList && $li > $p;
        $tooManyInputs = $input > floor($p / 3);
        $linkHeavyLowWeight = !$isList && $weight < 25 && $linkDensity > 0.2;
        $linkHeavyHighWeight = $weight >= 25 && $linkDensity > 0.5;
        $suspiciousEmbeds = ($embedCount === 1 && $contentLength < 75) || $embedCount > 1;

        $haveToRemove = $imagesOutweighParagraphs
            || $listItemsOutweighParagraphs
            || $tooManyInputs
            || (!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag('figure'))
            || $linkHeavyLowWeight
            || $linkHeavyHighWeight
            || $suspiciousEmbeds;

        if ($haveToRemove) {
            $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\'.', $tag));

            NodeUtility::removeNode($node);
        }
    }
}
||
| 1527 | |||
/**
 * Strips every element named $tag from the article.
 *
 * For the embedding tags (object, embed, iframe) an element is spared when the
 * videos pattern matches either its attribute values or its serialized markup.
 *
 * @param DOMDocument $article
 * @param string      $tag     Tag to clean
 *
 * @return void
 **/
public function _clean(DOMDocument $article, $tag)
{
    $isEmbed = in_array($tag, ['object', 'embed', 'iframe']);

    // Walk the live list from its tail so removals never shift unvisited entries.
    $candidates = $article->getElementsByTagName($tag);
    for ($index = $candidates->length - 1; $index >= 0; $index--) {
        $element = $candidates->item($index);

        if ($isEmbed) {
            // Join all attribute values so one regex pass covers every attribute.
            $values = [];
            foreach ($element->attributes as $attribute) {
                $values[] = $attribute->nodeValue;
            }
            $joinedValues = implode('|', $values);

            // Keep the element if its attributes, or failing that its markup, point to a video.
            if (preg_match(NodeUtility::$regexps['videos'], $joinedValues)
                || preg_match(NodeUtility::$regexps['videos'], $element->C14N())
            ) {
                continue;
            }
        }

        $this->logger->debug(sprintf('[PrepArticle] Removing node \'%s\'.', $element->tagName));

        NodeUtility::removeNode($element);
    }
}
||
| 1569 | |||
/**
 * Clean out spurious headers from an Element.
 *
 * Every h1 and h2 whose class weight is negative is removed. Class weight is
 * only evaluated when the configuration enables it; otherwise nothing is removed.
 *
 * @param DOMDocument $article
 *
 * @return void
 **/
public function _cleanHeaders(DOMDocument $article)
{
    for ($headerIndex = 1; $headerIndex < 3; $headerIndex++) {
        /*
         * getElementsByTagName() returns a live list. Removing a node while
         * iterating it directly makes the iteration skip the following header,
         * so iterate over a snapshot instead.
         */
        $headers = iterator_to_array($article->getElementsByTagName('h'.$headerIndex));
        /** @var $header DOMElement */
        foreach ($headers as $header) {
            $weight = 0;
            if ($this->configuration->getWeightClasses()) {
                $weight = $header->getClassWeight();
            }

            if ($weight < 0) {
                $this->logger->debug(sprintf('[PrepArticle] Removing H node with negative weight. Content was: \'%s\'', substr($header->nodeValue, 0, 128)));

                NodeUtility::removeNode($header);
            }
        }
    }
}
||
| 1596 | |||
/**
 * Recursively drops the class attribute from the given node and its subtree.
 *
 * Readability.js keeps the classes the algorithm itself adds; this port adds
 * none, so no class is preserved.
 *
 * @param DOMDocument|DOMNode $node
 *
 * @return void
 **/
public function _cleanClasses($node)
{
    if ('' !== $node->getAttribute('class')) {
        $node->removeAttribute('class');
    }

    // Start at the first element child, then follow the sibling chain to its end.
    $current = $node->getFirstElementChild();
    while (null !== $current) {
        $this->_cleanClasses($current);
        $current = $current->nextSibling;
    }
}
||
| 1618 | |||
/**
 * Final pass over the extracted article.
 *
 * When the configuration asks for relative URLs to be fixed, links with a
 * "javascript:" href are replaced by their text, every other non-empty href is
 * made absolute, and each image gets an absolute src taken from the first
 * non-empty candidate attribute. Class attributes are always stripped afterwards.
 *
 * @param DOMDocument $article
 *
 * @return DOMDocument The same document instance, after processing
 */
public function postProcessContent(DOMDocument $article)
{
    $this->logger->info('[PostProcess] PostProcessing content...');

    // Readability cannot open relative uris so we convert them to absolute uris.
    if ($this->configuration->getFixRelativeURLs()) {
        // Work on a snapshot: links may be replaced while we iterate.
        $anchors = iterator_to_array($article->getElementsByTagName('a'));
        foreach ($anchors as $anchor) {
            /** @var DOMElement $anchor */
            $target = $anchor->getAttribute('href');
            if (!$target) {
                continue;
            }

            if (strpos($target, 'javascript:') !== 0) {
                $this->logger->debug(sprintf('[PostProcess] Converting link to absolute URI: \'%s\'', substr($target, 0, 128)));

                $anchor->setAttribute('href', $this->toAbsoluteURI($target));
                continue;
            }

            // javascript: URIs won't work once scripts are gone, so keep only the link text.
            $label = $anchor->textContent;
            $this->logger->debug(sprintf('[PostProcess] Removing \'javascript:\' link. Content is: \'%s\'', substr($label, 0, 128)));

            $anchor->parentNode->replaceChild($article->createTextNode($label), $anchor);
        }

        // Attributes that may carry an image URL, in order of preference.
        $sourceAttributes = ['src', 'data-src', 'data-original', 'data-orig', 'data-url'];
        foreach ($article->getElementsByTagName('img') as $image) {
            /** @var DOMElement $image */
            $candidates = array_filter(array_map(function ($attributeName) use ($image) {
                return $image->getAttribute($attributeName);
            }, $sourceAttributes));

            // reset() yields the first surviving candidate, or false when there is none.
            $source = reset($candidates);
            if (!$source) {
                continue;
            }

            $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($source, 0, 128)));

            $image->setAttribute('src', $this->toAbsoluteURI($source));
        }
    }

    $this->_cleanClasses($article);

    return $article;
}
||
| 1676 | |||
/**
 * Renders the article as an <h1> holding the title, followed by the content markup.
 *
 * NOTE(review): the title is inserted as-is, without HTML escaping.
 *
 * @return string
 */
public function __toString()
{
    $heading = $this->getTitle();
    $body = $this->getContent();

    return sprintf('<h1>%s</h1>%s', $heading, $body);
}
||
| 1684 | |||
/**
 * Returns the stored article title.
 *
 * @return string|null
 */
public function getTitle()
{
    return $this->title;
}
||
| 1692 | |||
/**
 * Stores the article title.
 *
 * @param string $title
 */
protected function setTitle($title)
{
    $this->title = $title;
}
||
| 1700 | |||
/**
 * Returns the parsed content serialized with C14N().
 *
 * @return string|null Null when no content document has been set
 */
public function getContent()
{
    if (!($this->content instanceof DOMDocument)) {
        return null;
    }

    return $this->content->C14N();
}
||
| 1708 | |||
/**
 * Returns the content document itself rather than its serialized form.
 *
 * @return DOMDocument|null
 */
public function getDOMDocument()
{
    return $this->content;
}
||
| 1716 | |||
/**
 * Stores the document holding the parsed content.
 *
 * @param DOMDocument $content
 */
protected function setContent(DOMDocument $content)
{
    $this->content = $content;
}
||
| 1724 | |||
/**
 * Returns the stored article excerpt.
 *
 * @return null|string
 */
public function getExcerpt()
{
    return $this->excerpt;
}
||
| 1732 | |||
/**
 * Stores the article excerpt.
 *
 * @param null|string $excerpt
 */
public function setExcerpt($excerpt)
{
    $this->excerpt = $excerpt;
}
||
| 1740 | |||
/**
 * Returns the stored main image of the article.
 *
 * @return string|null
 */
public function getImage()
{
    return $this->image;
}
||
| 1748 | |||
/**
 * Stores the main image of the article.
 *
 * @param string $image
 */
protected function setImage($image)
{
    $this->image = $image;
}
||
| 1756 | |||
/**
 * Returns the stored article author.
 *
 * @return string|null
 */
public function getAuthor()
{
    return $this->author;
}
||
| 1764 | |||
/**
 * Stores the article author.
 *
 * @param string $author
 */
protected function setAuthor($author)
{
    $this->author = $author;
}
||
| 1772 | |||
/**
 * Returns the stored website name.
 *
 * @return string|null
 */
public function getSiteName()
{
    return $this->siteName;
}
||
| 1780 | |||
/**
 * Stores the website name.
 *
 * @param string $siteName
 */
protected function setSiteName($siteName)
{
    $this->siteName = $siteName;
}
||
| 1788 | |||
/**
 * Returns the stored direction value.
 *
 * @return null|string
 */
public function getDirection()
{
    return $this->direction;
}
||
| 1796 | |||
/**
 * Stores the direction value.
 *
 * @param null|string $direction
 */
public function setDirection($direction)
{
    $this->direction = $direction;
}
||
| 1804 | } |
||
| 1805 |