Readability::getArticleTitle()   D
last analyzed

Complexity

Conditions 19
Paths 57

Size

Total Lines 97
Code Lines 47

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 19
eloc 47
c 0
b 0
f 0
nc 57
nop 0
dl 0
loc 97
rs 4.5166

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: moving a coherent part of a long method into a new, well-named method.

1
<?php
2
3
namespace andreskrey\Readability;
4
5
use andreskrey\Readability\Nodes\DOM\DOMDocument;
6
use andreskrey\Readability\Nodes\DOM\DOMElement;
7
use andreskrey\Readability\Nodes\DOM\DOMNode;
8
use andreskrey\Readability\Nodes\DOM\DOMText;
9
use andreskrey\Readability\Nodes\NodeUtility;
10
use Psr\Log\LoggerInterface;
11
12
/**
13
 * Class Readability.
14
 */
15
class Readability
16
{
17
    /**
18
     * Main DOMDocument where all the magic happens.
19
     *
20
     * @var DOMDocument
21
     */
22
    protected $dom;
23
24
    /**
25
     * Title of the article.
26
     *
27
     * @var string|null
28
     */
29
    protected $title = null;
30
31
    /**
32
     * Final DOMDocument with the fully parsed HTML.
33
     *
34
     * @var DOMDocument|null
35
     */
36
    protected $content = null;
37
38
    /**
39
     * Excerpt of the article.
40
     *
41
     * @var string|null
42
     */
43
    protected $excerpt = null;
44
45
    /**
46
     * Main image of the article.
47
     *
48
     * @var string|null
49
     */
50
    protected $image = null;
51
52
    /**
53
     * Author of the article. Extracted from the byline tags and other social media properties.
54
     *
55
     * @var string|null
56
     */
57
    protected $author = null;
58
59
    /**
60
     * Website name.
61
     *
62
     * @var string|null
63
     */
64
    protected $siteName = null;
65
66
    /**
67
     * Direction of the text.
68
     *
69
     * @var string|null
70
     */
71
    protected $direction = null;
72
73
    /**
74
     * Configuration object.
75
     *
76
     * @var Configuration
77
     */
78
    private $configuration;
79
80
    /**
81
     * Logger object.
82
     *
83
     * @var LoggerInterface
84
     */
85
    private $logger;
86
87
    /**
88
     * Collection of attempted text extractions.
89
     *
90
     * @var array
91
     */
92
    private $attempts = [];
93
94
    /**
95
     * @var array
96
     */
97
    private $defaultTagsToScore = [
98
        'section',
99
        'h2',
100
        'h3',
101
        'h4',
102
        'h5',
103
        'h6',
104
        'p',
105
        'td',
106
        'pre',
107
    ];
108
109
    /**
110
     * @var array
111
     */
112
    private $alterToDIVExceptions = [
113
        'div',
114
        'article',
115
        'section',
116
        'p',
117
    ];
118
119
    /**
     * Creates a Readability instance bound to the given configuration.
     *
     * The logger used throughout the class is the one returned by the configuration's getLogger().
     *
     * @param Configuration $configuration
     */
    public function __construct(Configuration $configuration)
    {
        $this->logger = $configuration->getLogger();
        $this->configuration = $configuration;
    }
129
130
    /**
     * Main parse function.
     *
     * Loads the HTML, extracts metadata and the main image, and then scores the document nodes. While the
     * extracted text is shorter than the configured character threshold, the HTML is reloaded and scored again
     * with one more heuristic switched off (StripUnlikelyCandidates, then WeightClasses, then
     * CleanConditionally). Once every heuristic is disabled, the longest attempt is used.
     *
     * Note: the heuristics are switched off on the shared Configuration object and are not restored here.
     *
     * @param string $html HTML source to parse
     *
     * @throws ParseException when there is no usable body tag or no text could be extracted at all
     *
     * @return bool Always true; failures are reported through ParseException
     */
    public function parse($html)
    {
        $this->logger->info('*** Starting parse process...');

        // Attempts collected by a previous parse() call belong to another document and must not be reused.
        $this->attempts = [];

        $this->dom = $this->loadHTML($html);

        // Checking for minimum HTML to work with.
        if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) {
            $this->logger->emergency('No body tag present or body tag empty');

            throw new ParseException('Invalid or incomplete HTML.');
        }

        $this->getMetadata();

        $this->getMainImage();

        $result = null;
        $charThreshold = $this->configuration->getCharThreshold();

        while (true) {
            $root = $root->firstChild;

            $elementsToScore = $this->getNodes($root);
            $this->logger->debug(sprintf('Elements to score: \'%s\'', count($elementsToScore)));

            $result = $this->rateNodes($elementsToScore);

            /*
             * Now that we've gone through the full algorithm, check to see if
             * we got any meaningful content. If we didn't, we may need to re-run
             * the extraction with different flags set. This gives us a higher likelihood of
             * finding the content, and the sieve approach gives us a higher likelihood of
             * finding the -right- content.
             *
             * NOTE(review): rateNodes() is assumed to be able to return a falsy value when nothing was
             * found (the original code guarded with `$result &&`); such a result counts as zero characters
             * and triggers a retry instead of dereferencing a non-object.
             */
            $length = $result
                ? mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent))
                : 0;

            $this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $charThreshold));

            if ($result && $length >= $charThreshold) {
                break;
            }

            // Start the next attempt from a pristine DOM; the previous one was modified while scoring.
            $this->dom = $this->loadHTML($html);
            $root = $this->dom->getElementsByTagName('body')->item(0);

            // Remember this attempt so the longest one can be used if no attempt reaches the threshold.
            $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];

            if ($this->configuration->getStripUnlikelyCandidates()) {
                $this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
                $this->configuration->setStripUnlikelyCandidates(false);
            } elseif ($this->configuration->getWeightClasses()) {
                $this->logger->debug('[Parsing] Threshold not met, trying again setting WeightClasses as false');
                $this->configuration->setWeightClasses(false);
            } elseif ($this->configuration->getCleanConditionally()) {
                $this->logger->debug('[Parsing] Threshold not met, trying again setting CleanConditionally as false');
                $this->configuration->setCleanConditionally(false);
            } else {
                $this->logger->debug('[Parsing] Threshold not met, searching across attempts for some content.');

                // No luck after removing flags, just return the longest text we found during the different loops.
                // The comparator returns an integer (descending by length); returning a boolean is not a valid
                // three-way comparison and is deprecated in PHP 8.
                usort($this->attempts, function ($a, $b) {
                    return $b['textLength'] - $a['textLength'];
                });

                // But first check if we actually have something
                if (!$this->attempts[0]['textLength']) {
                    $this->logger->emergency('[Parsing] Could not parse text, giving up :(');

                    throw new ParseException('Could not parse text.');
                }

                $this->logger->debug('[Parsing] Threshold not met, but found some content in previous attempts.');

                $result = $this->attempts[0]['articleContent'];
                break;
            }
        }

        $result = $this->postProcessContent($result);

        // If we haven't found an excerpt in the article's metadata, use the article's
        // first paragraph as the excerpt. This can be used for displaying a preview of
        // the article's content.
        if (!$this->getExcerpt()) {
            $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
            $paragraphs = $result->getElementsByTagName('p');
            if ($paragraphs->length > 0) {
                $this->setExcerpt(trim($paragraphs->item(0)->textContent));
            }
        }

        $this->setContent($result);

        $this->logger->info('*** Parse successful :)');

        return true;
    }
237
238
    /**
     * Creates a DOM Document object and loads the provided HTML on it.
     *
     * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text).
     * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
     * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
     * objects and ruining the backup.
     *
     * Side effect: switches libxml to internal error handling (libxml_use_internal_errors(true)) and leaves it
     * enabled; the collected errors are discarded after loading.
     *
     * @param string $html
     *
     * @return DOMDocument
     */
    private function loadHTML($html)
    {
        $this->logger->debug('[Loading] Loading HTML...');

        // To avoid throwing a gazillion of errors on malformed HTMLs
        libxml_use_internal_errors(true);

        $dom = new DOMDocument('1.0', 'utf-8');

        if (!$this->configuration->getSubstituteEntities()) {
            // Keep the original HTML entities
            $dom->substituteEntities = false;
        }

        if ($this->configuration->getNormalizeEntities()) {
            $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
            // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
            // NOTE(review): the 'HTML-ENTITIES' pseudo-encoding is deprecated as of PHP 8.2 — plan a replacement.
            $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
        }

        if ($this->configuration->getSummonCthulhu()) {
            $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
            // Tag names are case-insensitive in HTML, hence the "i" modifier.
            $withoutScripts = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/i', '', $html);

            // preg_replace() returns null when PCRE fails (e.g. backtrack limit); keep the HTML untouched then.
            if ($withoutScripts !== null) {
                $html = $withoutScripts;
            }
        }

        // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
        $dom->loadHTML('<?xml encoding="UTF-8">'.$html);
        $dom->encoding = 'UTF-8';

        // The parse errors are never inspected; drop them so the libxml buffer does not grow on every (re)load.
        libxml_clear_errors();

        $this->removeScripts($dom);

        $this->prepDocument($dom);

        $this->logger->debug('[Loading] Loaded HTML successfully.');

        return $dom;
    }
287
288
    /**
     * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties.
     *
     * Every <meta> tag is inspected: its "property" attribute may hold a space-separated list of names
     * (e.g. "og:title"), its "name" attribute holds a single one. The matching content values are collected
     * under their normalized (lowercase, whitespace-free) names and then used, in priority order, to set the
     * title, author, excerpt, image and site name. When no title is found in the metadata, getArticleTitle()
     * is used instead.
     */
    private function getMetadata()
    {
        $this->logger->debug('[Metadata] Retrieving metadata...');

        $values = [];
        // property is a space-separated list of values
        $propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image|site_name)(?!:)\s*/i';

        // name is a single value
        $namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)(?!:)\s*$/i';

        // Find description tags.
        foreach ($this->dom->getElementsByTagName('meta') as $meta) {
            /* @var DOMNode $meta */
            $elementName = $meta->getAttribute('name');
            $elementProperty = $meta->getAttribute('property');
            $content = $meta->getAttribute('content');
            $propertyMatched = false;

            /*
             * Collect every full "prefix:name" occurrence of the property list. Only the full matches
             * ($matches[0]) are used: iterating over the capture groups of a single preg_match() would also
             * register bogus keys such as "og" or a bare "title" for an "og:title" property.
             */
            if ($elementProperty && preg_match_all($propertyPattern, $elementProperty, $matches)) {
                $propertyMatched = true;
                foreach ($matches[0] as $fullMatch) {
                    // Convert to lowercase, and remove any whitespace
                    // so we can match below.
                    $name = preg_replace('/\s/', '', mb_strtolower($fullMatch));
                    $values[$name] = trim($content);
                }
            }

            if (!$propertyMatched && $elementName && preg_match($namePattern, $elementName)) {
                if ($content) {
                    // Convert to lowercase, remove any whitespace, and convert dots
                    // to colons so we can match below.
                    $name = preg_replace(['/\s/', '/\./'], ['', ':'], mb_strtolower($elementName));
                    $values[$name] = trim($content);
                }
            }
        }

        // Returns the value of the first candidate key present in $values, or null when none is.
        $firstValue = function (array $candidates) use ($values) {
            foreach ($candidates as $candidate) {
                if (isset($values[$candidate])) {
                    return $values[$candidate];
                }
            }

            return null;
        };

        // get title
        $this->setTitle($firstValue([
            'dc:title',
            'dcterm:title',
            'og:title',
            'weibo:article:title',
            'weibo:webpage:title',
            'title',
            'twitter:title',
        ]));

        if (!$this->getTitle()) {
            $this->setTitle($this->getArticleTitle());
        }

        // get author
        $this->setAuthor($firstValue([
            'dc:creator',
            'dcterm:creator',
            'author',
        ]));

        // get description
        $this->setExcerpt($firstValue([
            'dc:description',
            'dcterm:description',
            'og:description',
            'weibo:article:description',
            'weibo:webpage:description',
            'description',
            'twitter:description',
        ]));

        // get main image
        $this->setImage($firstValue([
            'image',
            'og:image',
            'twitter:image',
        ]));

        // get site name
        $this->setSiteName($firstValue([
            'og:site_name',
        ]));
    }
395
396
    /**
     * Returns all the images of the parsed article.
     *
     * The main image (if any) comes first, followed by the "src" of every <img> in the parsed content.
     * When FixRelativeURLs is enabled every entry is converted to an absolute URI. Empty and duplicated
     * entries are removed and the result is re-indexed, so it is always a sequential list.
     *
     * @return array List of image URLs
     */
    public function getImages()
    {
        $result = [];
        if ($this->getImage()) {
            $result[] = $this->getImage();
        }

        // Nothing has been parsed yet: only the main image (if any) is known.
        if (!$this->getDOMDocument()) {
            return $result;
        }

        foreach ($this->getDOMDocument()->getElementsByTagName('img') as $img) {
            if ($src = $img->getAttribute('src')) {
                $result[] = $src;
            }
        }

        if ($this->configuration->getFixRelativeURLs()) {
            // Assign by key instead of by reference to avoid leaving a dangling reference behind.
            foreach ($result as $index => $imgSrc) {
                $result[$index] = $this->toAbsoluteURI($imgSrc);
            }
        }

        // array_filter() and array_unique() preserve keys; re-index so callers get a gap-free list.
        return array_values(array_unique(array_filter($result)));
    }
428
429
    /**
     * Tries to get the main article image. Will only look further when the getMetadata function couldn't
     * find an image: in that case the first <link> whose rel is "img_src" or "image_src" and that has an
     * href is used.
     *
     * The image found is stored through setImage(), converted to an absolute URI when FixRelativeURLs is
     * enabled. Nothing is returned.
     */
    public function getMainImage()
    {
        $imgUrl = $this->getImage();

        if (!$imgUrl) {
            foreach ($this->dom->getElementsByTagName('link') as $link) {
                /** @var \DOMElement $link */
                /*
                 * The rel attribute must be either img_src or image_src (getAttribute() returns an empty
                 * string when the attribute is missing), and the href attribute, which should hold the
                 * image url, must exist.
                 */
                if (in_array($link->getAttribute('rel'), ['img_src', 'image_src'], true) && $link->hasAttribute('href')) {
                    $imgUrl = $link->getAttribute('href');
                    break;
                }
            }
        }

        if (empty($imgUrl)) {
            return;
        }

        if ($this->configuration->getFixRelativeURLs()) {
            $imgUrl = $this->toAbsoluteURI($imgUrl);
        }

        // Store the image even when URLs are not being fixed; otherwise an image found in a <link> tag
        // would be silently discarded.
        $this->setImage($imgUrl);
    }
459
460
    /**
     * Returns the title of the html. Prioritizes the title from the metadata against the title tag.
     *
     * The raw title is then cleaned up:
     *  - if it contains a separator surrounded by spaces (|, -, \, /, > or »), the last part is removed
     *    (or the first part, when the result has fewer than 3 words);
     *  - otherwise, if it contains ": " and no h1/h2 holds the exact title, the text after the last colon
     *    is used (or after the first colon when that is too short);
     *  - otherwise, if it is longer than 150 or shorter than 15 characters and there is exactly one h1,
     *    the h1 text is used.
     * When the cleaned title ends up too short, the original title is returned instead.
     *
     * @return string|null The title, or null when neither the metadata nor a title tag provides one
     */
    private function getArticleTitle()
    {
        $originalTitle = null;

        if ($this->getTitle()) {
            $originalTitle = $this->getTitle();
        } else {
            $this->logger->debug('[Metadata] Could not find title in metadata, searching for the title tag...');
            $titleTag = $this->dom->getElementsByTagName('title');
            if ($titleTag->length > 0) {
                $this->logger->info(sprintf('[Metadata] Using title tag as article title: \'%s\'', $titleTag->item(0)->nodeValue));
                $originalTitle = $titleTag->item(0)->nodeValue;
            }
        }

        if ($originalTitle === null) {
            return null;
        }

        // preg_split() returns false when PCRE fails; count(false) is an error on PHP 8, so count it as 0 words.
        $wordCount = function ($text) {
            $words = preg_split('/\s+/', $text);

            return is_array($words) ? count($words) : 0;
        };

        $curTitle = $originalTitle = trim($originalTitle);
        $titleHadHierarchicalSeparators = false;

        /*
         * If there's a separator in the title, first remove the final part.
         *
         * The "u" modifier is required on every pattern containing "»": without it the character is treated
         * as two separate bytes, so " » " is never matched as a separator and a replacement can cut the
         * character in half. With the modifier the preg_* functions fail on invalid UTF-8 subjects; in that
         * case the title is simply left as it is.
         */
        if (preg_match('/ [\|\-\\\\\/>»] /iu', $curTitle)) {
            $titleHadHierarchicalSeparators = (bool) preg_match('/ [\\\\\/>»] /u', $curTitle);

            $withoutLastPart = preg_replace('/(.*)[\|\-\\\\\/>»] .*/iu', '$1', $originalTitle);
            if ($withoutLastPart !== null) {
                $curTitle = $withoutLastPart;
            }

            $this->logger->info(sprintf('[Metadata] Found hierarchical separators in title, new title is: \'%s\'', $curTitle));

            // If the resulting title is too short (fewer than 3 words), remove
            // the first part instead:
            if ($wordCount($curTitle) < 3) {
                $withoutFirstPart = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/iu', '$1', $originalTitle);
                if ($withoutFirstPart !== null) {
                    $curTitle = $withoutFirstPart;
                }
                $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
            }
        } elseif (strpos($curTitle, ': ') !== false) {
            // Check if we have an heading containing this exact string, so we
            // could assume it's the full title.
            $match = false;
            for ($i = 1; $i <= 2 && !$match; $i++) {
                foreach ($this->dom->getElementsByTagName('h'.$i) as $hTag) {
                    // Trim texts to avoid having false negatives when the title is surrounded by spaces or tabs
                    if (trim($hTag->nodeValue) === trim($curTitle)) {
                        $match = true;
                        break;
                    }
                }
            }

            // If we don't, let's extract the title out of the original title string.
            if (!$match) {
                $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);

                $this->logger->info(sprintf('[Metadata] Title has a colon in the middle, new title is: \'%s\'', $curTitle));

                // If the title is now too short, try the first colon instead:
                if ($wordCount($curTitle) < 3) {
                    $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
                    $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
                } elseif ($wordCount(substr($originalTitle, 0, strpos($originalTitle, ':'))) > 5) {
                    // But if we have too many words before the colon there's something weird
                    // with the titles and the H tags so let's just use the original title instead.
                    // The words are counted on the original title: $curTitle no longer contains a colon here.
                    $curTitle = $originalTitle;
                }
            }
        } elseif (mb_strlen($curTitle) > 150 || mb_strlen($curTitle) < 15) {
            $hOnes = $this->dom->getElementsByTagName('h1');

            if ($hOnes->length === 1) {
                $curTitle = $hOnes->item(0)->nodeValue;
                $this->logger->info(sprintf('[Metadata] Using title from an H1 node: \'%s\'', $curTitle));
            }
        }

        $curTitle = trim($curTitle);

        /*
         * If we now have 4 words or fewer as our title, and either no
         * 'hierarchical' separators (\, /, > or ») were found in the original
         * title or we decreased the number of words by more than 1 word, use
         * the original title.
         */
        $titleWithoutSeparators = preg_replace('/[\|\-\\\\\/>»]+/u', '', $originalTitle);
        if ($titleWithoutSeparators === null) {
            $titleWithoutSeparators = $originalTitle;
        }

        $curTitleWordCount = $wordCount($curTitle);
        $originalTitleWordCount = $wordCount($titleWithoutSeparators) - 1;

        if ($curTitleWordCount <= 4 &&
            (!$titleHadHierarchicalSeparators || $curTitleWordCount !== $originalTitleWordCount)) {
            $curTitle = $originalTitle;

            $this->logger->info(sprintf('[Metadata] Processed title is too short, using the original title: \'%s\'', $curTitle));
        }

        return $curTitle;
    }
563
564
    /**
     * Convert URI to an absolute URI.
     *
     * Absolute URIs (anything starting with a scheme) and hash-only URIs are returned untouched. Every other
     * form is resolved against the path info of the configured original URL (see getPathInfo()).
     *
     * @param $uri string URI to convert
     *
     * @return string
     */
    private function toAbsoluteURI($uri)
    {
        // If this is already an absolute URI, or a hash URI, return it as is. Checked first so the base
        // path is only computed for URIs that actually need it.
        if (preg_match('/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/', $uri) || substr($uri, 0, 1) === '#') {
            return $uri;
        }

        list($pathBase, $scheme, $prePath) = $this->getPathInfo($this->configuration->getOriginalURL());

        // Scheme-rooted relative URI.
        if (substr($uri, 0, 2) === '//') {
            return $scheme.'://'.substr($uri, 2);
        }

        // Prepath-rooted relative URI.
        if (substr($uri, 0, 1) === '/') {
            return $prePath.$uri;
        }

        // Dotslash relative URI.
        if (strpos($uri, './') === 0) {
            return $pathBase.substr($uri, 2);
        }

        // Standard relative URI; add entire path. pathBase already includes a
        // trailing "/".
        return $pathBase.$uri;
    }
603
604
    /**
     * Returns full path info of an URL.
     *
     *  - $pathBase: scheme, authority and directory of the URL, always with a trailing "/". When the
     *    document declares a base URI it is taken into account (see below).
     *  - $scheme: scheme of the URL.
     *  - $prePath: scheme and authority (host plus port, if any) without a trailing "/".
     *
     * @param  string $url
     *
     * @return array [$pathBase, $scheme, $prePath]
     */
    public function getPathInfo($url)
    {
        $url = (string) $url;

        $scheme = parse_url($url, PHP_URL_SCHEME);
        $port = parse_url($url, PHP_URL_PORT);
        // Keep a non-default port: dropping it would resolve relative URIs against the wrong authority.
        $prePath = $scheme.'://'.parse_url($url, PHP_URL_HOST).($port ? ':'.$port : '');

        /*
         * Directory of the URL path, up to and including the last "/". dirname() is not suitable here: it
         * yields "/" for root level pages (producing a "//" once the separator is appended), drops the last
         * directory of paths ending in "/", and uses backslashes on Windows.
         */
        $path = (string) parse_url($url, PHP_URL_PATH);
        $lastSlash = strrpos($path, '/');
        $directory = $lastSlash === false ? '/' : substr($path, 0, $lastSlash + 1);

        // Check for base URLs
        // NOTE(review): assumes baseURI holds the value of the document's <base> element — confirm how
        // the project's DOMDocument populates it.
        if ($this->dom->baseURI !== null) {
            if (substr($this->dom->baseURI, 0, 1) === '/') {
                // URLs starting with '/' override completely the URL defined in the link
                $pathBase = $prePath.$this->dom->baseURI;
            } else {
                // Otherwise just prepend the base to the actual path
                $pathBase = $prePath.$directory.rtrim($this->dom->baseURI, '/').'/';
            }
        } else {
            $pathBase = $prePath.$directory;
        }

        return [$pathBase, $scheme, $prePath];
    }
631
632
    /**
     * Walks the document starting at the given node, cleans it up and collects the nodes to be scored.
     *
     * While traversing, the following nodes are removed: comments, nodes that are probably not visible,
     * bylines (see checkByline()), unlikely candidates (only when StripUnlikelyCandidates is enabled) and
     * empty div/section/header/h1-h6/p elements. Nodes whose tag is listed in $defaultTagsToScore are added
     * to the result. Phrasing content found directly inside a div is wrapped into paragraphs, and divs are
     * replaced by (or converted to) a p where possible; those paragraphs are added to the result as well.
     *
     * @param $node DOMNode|DOMText Node where the traversal starts
     *
     * @return array Nodes to score
     */
    private function getNodes($node)
    {
        $this->logger->info('[Get Nodes] Retrieving nodes...');

        $stripUnlikelyCandidates = $this->configuration->getStripUnlikelyCandidates();

        // Tags that are dropped when they hold no content (e.g. text, image, video, or iframe).
        $removableWhenEmpty = ['div', 'section', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'];

        $elementsToScore = [];

        /*
         * First, node prepping. Trash nodes that look cruddy (like ones with the
         * class name "comment", etc), and turn divs into P tags where they have been
         * used inappropriately (as in, where they contain no other block level elements.)
         */
        while ($node) {
            // Comment nodes are of no use and mess up the children counting, so they go away first.
            if ($node->nodeType === XML_COMMENT_NODE) {
                $this->logger->debug(sprintf('[Get Nodes] Found comment node, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
                $node = NodeUtility::removeAndGetNext($node);
                continue;
            }

            // Class and id are matched together against the candidate regexps below.
            $matchString = $node->getAttribute('class').' '.$node->getAttribute('id');

            if (!$node->isProbablyVisible()) {
                $this->logger->debug(sprintf('[Get Nodes] Removing hidden node... Match string was: \'%s\'', $matchString));
                $node = NodeUtility::removeAndGetNext($node);
                continue;
            }

            // A byline is stored as the author by checkByline() and then removed from the document.
            if ($this->checkByline($node, $matchString)) {
                $this->logger->debug(sprintf('[Get Nodes] Found byline, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
                $node = NodeUtility::removeAndGetNext($node);
                continue;
            }

            // Unlikely candidates are only stripped while the flag is enabled; body and a tags are never stripped.
            if (
                $stripUnlikelyCandidates &&
                preg_match(NodeUtility::$regexps['unlikelyCandidates'], $matchString) &&
                !preg_match(NodeUtility::$regexps['okMaybeItsACandidate'], $matchString) &&
                $node->nodeName !== 'body' &&
                $node->nodeName !== 'a'
            ) {
                $this->logger->debug(sprintf('[Get Nodes] Removing unlikely candidate. Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));
                $node = NodeUtility::removeAndGetNext($node);
                continue;
            }

            if (in_array($node->nodeName, $removableWhenEmpty, true) && $node->isElementWithoutContent()) {
                $this->logger->debug(sprintf('[Get Nodes] Removing empty \'%s\' node.', $node->nodeName));
                $node = NodeUtility::removeAndGetNext($node);
                continue;
            }

            if (in_array(strtolower($node->nodeName), $this->defaultTagsToScore)) {
                $this->logger->debug(sprintf('[Get Nodes] Adding node to score list, node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
                $elementsToScore[] = $node;
            }

            // Turn all divs that don't have children block level elements into p's
            if ($node->nodeName === 'div') {
                // Put phrasing content into paragraphs: consecutive phrasing children share one new p.
                $paragraph = null;
                for ($child = $node->firstChild; $child; $child = $upcoming) {
                    // Grab the sibling up front: the child may be moved into a paragraph below.
                    $upcoming = $child->nextSibling;

                    if ($child->isPhrasingContent()) {
                        if ($paragraph !== null) {
                            $paragraph->appendChild($child);
                        } elseif (!$child->isWhitespace()) {
                            // First meaningful phrasing child: a new p takes its place and adopts it.
                            $paragraph = $this->dom->createElement('p');
                            $node->replaceChild($paragraph, $child);
                            $paragraph->appendChild($child);
                        }
                    } elseif ($paragraph !== null) {
                        // A non-phrasing child closes the current paragraph; trailing whitespace is dropped.
                        while ($paragraph->lastChild && $paragraph->lastChild->isWhitespace()) {
                            $paragraph->removeChild($paragraph->lastChild);
                        }
                        $paragraph = null;
                    }
                }

                /*
                 * Sites like http://mobile.slate.com encloses each paragraph with a DIV
                 * element. DIVs with only a P element inside and no text content can be
                 * safely converted into plain P elements to avoid confusing the scoring
                 * algorithm with DIVs with are, in practice, paragraphs.
                 */
                if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) {
                    $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
                    $innerParagraph = NodeUtility::filterTextNodes($node->childNodes)->item(0);
                    $node->parentNode->replaceChild($innerParagraph, $node);
                    $node = $innerParagraph;
                    $elementsToScore[] = $node;
                } elseif (!$node->hasSingleChildBlockElement()) {
                    $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single child block element, converting to a P node. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
                    $node = NodeUtility::setNodeTag($node, 'p');
                    $elementsToScore[] = $node;
                }
            }

            $node = NodeUtility::getNextNode($node);
        }

        return $elementsToScore;
    }
754
755
    /**
     * Checks if the node is a byline and, when it is, stores its text as the article author.
     *
     * Nothing is checked when byline detection is disabled in the configuration or when an
     * author has already been set.
     *
     * @param DOMNode $node        Node being inspected
     * @param string  $matchString String tested against the byline regular expression
     *
     * @return bool True when the node was accepted as the byline and the author was set
     */
    private function checkByline($node, $matchString)
    {
        if (!$this->configuration->getArticleByLine()) {
            return false;
        }

        /*
         * Check if the byline is already set
         */
        if ($this->getAuthor()) {
            return false;
        }

        $rel = $node->getAttribute('rel');

        $looksLikeByline = $rel === 'author' || preg_match(NodeUtility::$regexps['byline'], $matchString);
        if (!$looksLikeByline) {
            return false;
        }

        /*
         * The validity check applies to both detection paths. Without the explicit grouping,
         * "&&" binds tighter than "||" and a rel="author" node would be accepted even when its
         * text is empty or far too long to be a byline.
         */
        $text = $node->getTextContent();
        if (!$this->isValidByline($text)) {
            return false;
        }

        $this->logger->info(sprintf('[Metadata] Found article author: \'%s\'', $text));
        $this->setAuthor(trim($text));

        return true;
    }
787
788
    /**
     * Tells whether a text can be a byline, judging only by its length.
     *
     * A valid byline is a string that, once trimmed, has between 1 and 99 characters.
     *
     * @param string $text Candidate byline text
     *
     * @return bool
     */
    private function isValidByline($text)
    {
        // Anything that is not a string can never be a byline.
        if (!is_string($text)) {
            return false;
        }

        $length = mb_strlen(trim($text));

        return $length > 0 && $length < 100;
    }
805
806
    /**
     * Removes every <script> and <noscript> element from the document.
     *
     * @param DOMDocument $dom Document to strip, modified in place
     */
    private function removeScripts(DOMDocument $dom)
    {
        $tagNames = ['script', 'noscript'];

        foreach ($tagNames as $tagName) {
            // Copy the node list into an array first so that removing nodes does not disturb the iteration.
            $matches = iterator_to_array($dom->getElementsByTagName($tagName));

            foreach ($matches as $match) {
                NodeUtility::removeNode($match);
            }
        }
    }
820
821
    /**
     * Prepares the document for parsing.
     *
     * Chains of two or more <br> elements are collapsed into a <p> that adopts the phrasing
     * content following the chain, and every <font> element is converted into a <span>.
     *
     * @param DOMDocument $dom Document to normalise, modified in place
     */
    private function prepDocument(DOMDocument $dom)
    {
        $this->logger->info('[PrepDocument] Preparing document for parsing...');

        foreach ($dom->shiftingAwareGetElementsByTagName('br') as $br) {
            $cursor = $br->nextSibling;

            // Set as soon as one following <br> has been removed, i.e. this <br> started a chain.
            $chainCollapsed = false;

            /*
             * Remove the <br>s that follow this one until another element (or, presumably,
             * non-whitespace text - see NodeUtility::nextElement) is reached. The first <br> of
             * the chain is left in place; it is swapped for a <p> below.
             */
            while (($cursor = NodeUtility::nextElement($cursor)) && ($cursor->nodeName === 'br')) {
                $this->logger->debug('[PrepDocument] Removing chain of BR nodes...');

                $chainCollapsed = true;
                $afterRemoved = $cursor->nextSibling;
                $cursor->parentNode->removeChild($cursor);
                $cursor = $afterRemoved;
            }

            // A lone <br> is left untouched.
            if (!$chainCollapsed) {
                continue;
            }

            /*
             * Swap the remaining <br> for a <p> and move the following siblings into it until
             * another <br> chain or a non-phrasing node is found.
             */
            $paragraph = $dom->createElement('p');
            $br->parentNode->replaceChild($paragraph, $br);

            $cursor = $paragraph->nextSibling;
            while ($cursor) {
                // Another <br><br> ahead means this paragraph is complete.
                if ($cursor->nodeName === 'br') {
                    $following = NodeUtility::nextElement($cursor->nextSibling);
                    if ($following && $following->nodeName === 'br') {
                        break;
                    }
                }

                // Block level content cannot live inside the paragraph.
                if (!$cursor->isPhrasingContent()) {
                    break;
                }

                $this->logger->debug('[PrepDocument] Replacing BR with a P node...');

                // Remember the next sibling before the node is moved into the <p>.
                $upcoming = $cursor->nextSibling;
                $paragraph->appendChild($cursor);
                $cursor = $upcoming;
            }

            // Strip trailing whitespace nodes from the new paragraph.
            while ($paragraph->lastChild && $paragraph->lastChild->isWhitespace()) {
                $paragraph->removeChild($paragraph->lastChild);
            }

            // A <p> must not contain another <p>, so the surrounding one becomes a <div>.
            if ($paragraph->parentNode->tagName === 'p') {
                NodeUtility::setNodeTag($paragraph->parentNode, 'div');
            }
        }

        // Replace font tags with span, walking the node list from its last item to its first.
        $fontNodes = $dom->getElementsByTagName('font');
        for ($index = $fontNodes->length - 1; $index >= 0; $index--) {
            $this->logger->debug('[PrepDocument] Converting font tag into a span tag.');
            NodeUtility::setNodeTag($fontNodes->item($index), 'span');
        }
    }
904
905
    /**
     * Assign scores to each node. Returns full article parsed or false on error.
     *
     * The method works in four steps:
     *  1. Every node with enough text adds a score to each of its ancestors.
     *  2. The best scored ancestors are kept as top candidates.
     *  3. The top candidate is refined by looking at its parents.
     *  4. The top candidate and its related siblings are copied into a new document, which is
     *     then passed through prepArticle().
     *
     * @param array $nodes Elements to score
     *
     * @return DOMDocument|bool The prepared article, or false when no content was appended to it
     */
    private function rateNodes($nodes)
    {
        $this->logger->info('[Rating] Rating nodes...');

        $candidates = [];

        /** @var DOMElement $node */
        foreach ($nodes as $node) {
            if (is_null($node->parentNode)) {
                continue;
            }

            // Fetched once and reused for the length check and for the scoring below.
            $innerText = $node->getTextContent(true);
            $innerTextLength = mb_strlen($innerText);

            // Discard nodes with less than 25 characters, without blank space
            if ($innerTextLength < 25) {
                continue;
            }

            $ancestors = $node->getNodeAncestors();

            // Exclude nodes with no ancestor
            if (count($ancestors) === 0) {
                continue;
            }

            // Start with a point for the paragraph itself as a base.
            $contentScore = 1;

            // Add one point per comma separated segment of the text (number of commas plus one).
            $contentScore += count(explode(',', $innerText));

            // For every 100 characters in this paragraph, add another point. Up to 3 points.
            $contentScore += min(floor($innerTextLength / 100), 3);

            // mb_substr keeps the preview from being cut in the middle of a multibyte character.
            $this->logger->debug(sprintf('[Rating] Node score %s, content: \'%s\'', $contentScore, mb_substr($node->nodeValue, 0, 128)));

            /** @var DOMElement $ancestor */
            foreach ($ancestors as $level => $ancestor) {
                $this->logger->debug('[Rating] Found ancestor, initializing and adding it as a candidate...');
                if (!$ancestor->isInitialized()) {
                    $ancestor->initializeNode($this->configuration->getWeightClasses());
                    $candidates[] = $ancestor;
                }

                /*
                 * Node score divider:
                 *  - parent:             1 (no division)
                 *  - grandparent:        2
                 *  - great grandparent+: ancestor level * 3
                 */
                if ($level === 0) {
                    $scoreDivider = 1;
                } elseif ($level === 1) {
                    $scoreDivider = 2;
                } else {
                    $scoreDivider = $level * 3;
                }

                $currentScore = $ancestor->contentScore;
                $ancestor->contentScore = $currentScore + ($contentScore / $scoreDivider);

                $this->logger->debug(sprintf('[Rating] Ancestor score %s, value: \'%s\'', $ancestor->contentScore, mb_substr($ancestor->nodeValue, 0, 128)));
            }
        }

        /*
         * After we've calculated scores, loop through all of the possible
         * candidate nodes we found and find the one with the highest score.
         */
        $maxTopCandidates = $this->configuration->getMaxTopCandidates();
        $topCandidates = [];

        foreach ($candidates as $candidate) {
            /*
             * Scale the final candidates score based on link density. Good content
             * should have a relatively small link density (5% or less) and be mostly
             * unaffected by this operation.
             */
            $candidate->contentScore = $candidate->contentScore * (1 - $candidate->getLinkDensity());

            // Insert the candidate at its position in the list, which is kept sorted by descending score.
            for ($i = 0; $i < $maxTopCandidates; $i++) {
                $aTopCandidate = isset($topCandidates[$i]) ? $topCandidates[$i] : null;

                if (!$aTopCandidate || $candidate->contentScore > $aTopCandidate->contentScore) {
                    array_splice($topCandidates, $i, 0, [$candidate]);
                    if (count($topCandidates) > $maxTopCandidates) {
                        array_pop($topCandidates);
                    }
                    break;
                }
            }
        }

        $topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null;
        $parentOfTopCandidate = null;

        /*
         * If we still have no top candidate, just use the body as a last resort.
         * We also have to copy the body node so it is something we can modify.
         */
        if ($topCandidate === null || $topCandidate->nodeName === 'body') {
            $this->logger->info('[Rating] No top candidate found or top candidate is the body tag. Moving all child nodes to a new DIV node.');

            // Move all of the page's children into topCandidate
            $topCandidate = new DOMDocument('1.0', 'utf-8');
            $topCandidate->encoding = 'UTF-8';
            $topCandidate->appendChild($topCandidate->createElement('div', ''));
            $kids = $this->dom->getElementsByTagName('body')->item(0)->childNodes;

            // Deliberately an index based loop, as in the original implementation.
            for ($i = 0; $i < $kids->length; $i++) {
                $import = $topCandidate->importNode($kids->item($i), true);
                $topCandidate->firstChild->appendChild($import);
            }

            // Candidate must be created using firstChild to grab the DOMElement instead of the DOMDocument.
            $topCandidate = $topCandidate->firstChild;
        } else {
            $this->logger->info(sprintf('[Rating] Found top candidate, score: %s', $topCandidate->contentScore));

            // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
            // and whose scores are quite close to the current `topCandidate` node.
            $alternativeCandidateAncestors = [];
            $topCandidatesCount = count($topCandidates);
            for ($i = 1; $i < $topCandidatesCount; $i++) {
                // In some cases we may end up with a top candidate with zero content score. To avoid dividing by zero
                // we have to use max() and replace zero with a low value like 0.1
                if ($topCandidates[$i]->contentScore / max($topCandidate->contentScore, 0.1) >= 0.75) {
                    array_push($alternativeCandidateAncestors, $topCandidates[$i]->getNodeAncestors(false));
                }
            }

            $minimumTopCandidates = 3;
            $alternativeListsCount = count($alternativeCandidateAncestors);
            if ($alternativeListsCount >= $minimumTopCandidates) {
                $parentOfTopCandidate = $topCandidate->parentNode;

                // Check if we are actually dealing with a DOMNode and not a DOMDocument node or higher
                while ($parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) {
                    $listsContainingThisAncestor = 0;
                    for ($ancestorIndex = 0; $ancestorIndex < $alternativeListsCount && $listsContainingThisAncestor < $minimumTopCandidates; $ancestorIndex++) {
                        // Strict search: the very same node has to be in the list, not merely a node that compares equal.
                        $listsContainingThisAncestor += (int) in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true);
                    }
                    if ($listsContainingThisAncestor >= $minimumTopCandidates) {
                        $topCandidate = $parentOfTopCandidate;
                        break;
                    }
                    $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
                }
            }

            /*
             * Because of our bonus system, parents of candidates might have scores
             * themselves. They get half of the node. There won't be nodes with higher
             * scores than our topCandidate, but if we see the score going *up* in the first
             * few steps up the tree, that's a decent sign that there might be more content
             * lurking in other places that we want to unify in. The sibling stuff
             * below does some of that - but only if we've looked high enough up the DOM
             * tree.
             */
            $parentOfTopCandidate = $topCandidate->parentNode;
            $lastScore = $topCandidate->contentScore;

            // The scores shouldn't get too low.
            $scoreThreshold = $lastScore / 3;

            /*
             * Same element-only guard as the loop above: without it the walk can step onto the
             * document node and then onto a null parent.
             */
            /* @var DOMElement $parentOfTopCandidate */
            while ($parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) {
                $parentScore = $parentOfTopCandidate->contentScore;
                if ($parentScore < $scoreThreshold) {
                    break;
                }

                if ($parentScore > $lastScore) {
                    // Alright! We found a better parent to use.
                    $topCandidate = $parentOfTopCandidate;
                    $this->logger->info('[Rating] Found a better top candidate.');
                    break;
                }
                $lastScore = $parentOfTopCandidate->contentScore;
                $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
            }

            // If the top candidate is the only child, use parent instead. This will help sibling
            // joining logic when adjacent content is actually located in parent's sibling node.
            $parentOfTopCandidate = $topCandidate->parentNode;
            while (
                $parentOfTopCandidate->nodeName !== 'body'
                && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE
                && count(NodeUtility::filterTextNodes($parentOfTopCandidate->childNodes)) === 1
            ) {
                $topCandidate = $parentOfTopCandidate;
                $parentOfTopCandidate = $topCandidate->parentNode;
            }
        }

        /*
         * Now that we have the top candidate, look through its siblings for content
         * that might also be related. Things like preambles, content split by ads
         * that we removed, etc.
         */
        $this->logger->info('[Rating] Creating final article content document...');

        // The selected nodes are appended straight to this document.
        $articleContent = new DOMDocument('1.0', 'utf-8');

        $siblingScoreThreshold = max(10, $topCandidate->contentScore * 0.2);
        // Keep potential top candidate's parent node to try to get text direction of it later.
        $parentOfTopCandidate = $topCandidate->parentNode;
        $siblings = $parentOfTopCandidate->childNodes;

        $hasContent = false;

        $this->logger->info('[Rating] Adding top candidate siblings...');

        /* @var DOMElement $sibling */
        // Can't foreach here because down there we might change the tag name and that causes the foreach to skip items
        for ($i = 0; $i < $siblings->length; $i++) {
            $sibling = $siblings->item($i);
            $append = false;

            if ($sibling === $topCandidate) {
                $this->logger->debug('[Rating] Sibling is equal to the top candidate, adding to the final article...');

                $append = true;
            } else {
                $contentBonus = 0;

                // Give a bonus if sibling nodes and top candidates have the exact same classname
                $topCandidateClass = $topCandidate->getAttribute('class');
                if ($sibling->getAttribute('class') === $topCandidateClass && $topCandidateClass !== '') {
                    $contentBonus += $topCandidate->contentScore * 0.2;
                }

                if ($sibling->contentScore + $contentBonus >= $siblingScoreThreshold) {
                    $append = true;
                } elseif ($sibling->nodeName === 'p') {
                    $linkDensity = $sibling->getLinkDensity();
                    $nodeContent = $sibling->getTextContent(true);
                    $nodeContentLength = mb_strlen($nodeContent);

                    if ($nodeContentLength > 80 && $linkDensity < 0.25) {
                        $append = true;
                    } elseif ($nodeContent && $nodeContentLength < 80 && (float) $linkDensity === 0.0 && preg_match('/\.( |$)/', $nodeContent)) {
                        // The density is compared as a float so that both 0 and 0.0 count as "no links".
                        $append = true;
                    }
                }
            }

            if ($append) {
                $this->logger->debug(sprintf('[Rating] Appending sibling to final article, content is: \'%s\'', mb_substr($sibling->nodeValue, 0, 128)));

                $hasContent = true;

                if (!in_array(strtolower($sibling->nodeName), $this->alterToDIVExceptions, true)) {
                    /*
                     * We have a node that isn't a common block level element, like a form or td tag.
                     * Turn it into a div so it doesn't get filtered out later by accident.
                     */
                    $sibling = NodeUtility::setNodeTag($sibling, 'div');
                }

                /*
                 * The sibling is imported (deep copied) into the article document, so it stays
                 * in its original parent and the sibling list keeps its length.
                 */
                $import = $articleContent->importNode($sibling, true);
                $articleContent->appendChild($import);
            }
        }

        $articleContent = $this->prepArticle($articleContent);

        if (!$hasContent) {
            return false;
        }

        // Find out text direction from ancestors of final top candidate.
        $ancestors = array_merge([$parentOfTopCandidate, $topCandidate], $parentOfTopCandidate->getNodeAncestors());
        foreach ($ancestors as $ancestor) {
            $articleDir = $ancestor->getAttribute('dir');
            if ($articleDir) {
                $this->setDirection($articleDir);
                $this->logger->debug(sprintf('[Rating] Found article direction: %s', $articleDir));
                break;
            }
        }

        return $articleContent;
    }
1199
1200
    /**
     * Cleans up the final article.
     *
     * Removes styles and unwanted elements, drops an H2 that merely repeats the article title,
     * removes <br> nodes placed right before a paragraph and unwraps single-cell tables.
     *
     * @param DOMDocument $article Article document, modified in place
     *
     * @return DOMDocument The same document that was received
     */
    public function prepArticle(DOMDocument $article)
    {
        $this->logger->info('[PrepArticle] Preparing final article...');

        $this->_cleanStyles($article);
        $this->_clean($article, 'style');

        // Check for data tables before we continue, to avoid removing items in
        // those tables, which will often be isolated even though they're
        // visually linked to other content-ful elements (text, images, etc.).
        $this->_markDataTables($article);

        // Clean out junk from the article content
        $this->_cleanConditionally($article, 'form');
        $this->_cleanConditionally($article, 'fieldset');
        $this->_clean($article, 'object');
        $this->_clean($article, 'embed');
        $this->_clean($article, 'h1');
        $this->_clean($article, 'footer');
        $this->_clean($article, 'link');
        $this->_clean($article, 'aside');

        // Clean out elements that have "share" in their id/class combinations from final top candidates,
        // which means we don't remove the top candidates even if they have "share".
        foreach ($article->childNodes as $child) {
            $this->_cleanMatchedNodes($child, '/share/i');
        }

        /*
         * If there is only one h2 and its text content substantially equals article title,
         * they are probably using it as a header and not a subheader,
         * so remove it since we already extract the title separately.
         */
        $h2 = $article->getElementsByTagName('h2');
        if ($h2->length === 1) {
            // The title may not be set; the cast keeps null away from the string functions below.
            $title = (string) $this->getTitle();
            $headingText = $h2->item(0)->textContent;
            $titleLength = mb_strlen($title);

            $lengthSimilarRate = (mb_strlen($headingText) - $titleLength) / max($titleLength, 1);

            if (abs($lengthSimilarRate) < 0.5) {
                // The shorter text is searched inside the longer one.
                if ($lengthSimilarRate > 0) {
                    $haystack = $headingText;
                    $needle = $title;
                } else {
                    $haystack = $title;
                    $needle = $headingText;
                }

                // An empty needle is never a match (strpos would warn or match at offset 0, depending on the PHP version).
                $titlesMatch = $needle !== '' && strpos($haystack, $needle) !== false;

                if ($titlesMatch) {
                    $this->logger->info('[PrepArticle] Found title repeated in an H2 node, removing...');
                    $this->_clean($article, 'h2');
                }
            }
        }

        $this->_clean($article, 'iframe');
        $this->_clean($article, 'input');
        $this->_clean($article, 'textarea');
        $this->_clean($article, 'select');
        $this->_clean($article, 'button');
        $this->_cleanHeaders($article);

        // Do these last as the previous stuff may have removed junk
        // that will affect these
        $this->_cleanConditionally($article, 'table');
        $this->_cleanConditionally($article, 'ul');
        $this->_cleanConditionally($article, 'div');

        $this->_cleanExtraParagraphs($article);

        // Remove every <br> that is immediately followed by a paragraph.
        foreach (iterator_to_array($article->getElementsByTagName('br')) as $br) {
            $next = $br->nextSibling;
            if ($next && $next->nodeName === 'p') {
                $this->logger->debug('[PrepArticle] Removing br node next to a p node.');
                $br->parentNode->removeChild($br);
            }
        }

        // Remove single-cell tables
        foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) {
            /** @var DOMNode $table */
            $tbody = $table->hasSingleTagInsideElement('tbody') ? $table->getFirstElementChild() : $table;
            if ($tbody->hasSingleTagInsideElement('tr')) {
                $row = $tbody->getFirstElementChild();
                if ($row->hasSingleTagInsideElement('td')) {
                    $cell = $row->getFirstElementChild();

                    // The cell becomes a P when all of its children are phrasing content, a DIV otherwise.
                    $onlyPhrasingContent = array_reduce(iterator_to_array($cell->childNodes), function ($carry, $node) {
                        // $carry is tested first so the remaining children are skipped after the first failure.
                        return $carry && $node->isPhrasingContent();
                    }, true);

                    $cell = NodeUtility::setNodeTag($cell, $onlyPhrasingContent ? 'p' : 'div');
                    $table->parentNode->replaceChild($cell, $table);
                }
            }
        }

        return $article;
    }
1298
1299
    /**
     * Look for 'data' (as opposed to 'layout') tables, for which we use
     * similar checks as
     * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920.
     *
     * Every table of the document is flagged through setReadabilityDataTable(); the checks run
     * in order and the first one that applies decides the flag.
     *
     * @param DOMDocument $article
     *
     * @return void
     */
    public function _markDataTables(DOMDocument $article)
    {
        $tables = $article->getElementsByTagName('table');
        foreach ($tables as $table) {
            /** @var DOMElement $table */
            $role = $table->getAttribute('role');
            if ($role === 'presentation') {
                $table->setReadabilityDataTable(false);
                continue;
            }

            /*
             * Strict comparison on purpose: with "==" PHP compares numeric strings by value,
             * so values such as "00" or "0.0" would also be treated as "0".
             */
            $datatable = $table->getAttribute('datatable');
            if ($datatable === '0') {
                $table->setReadabilityDataTable(false);
                continue;
            }

            // Any non-empty summary counts, including "0", which a plain truthiness test would reject.
            $summary = $table->getAttribute('summary');
            if ((string) $summary !== '') {
                $table->setReadabilityDataTable(true);
                continue;
            }

            // A caption with content marks a data table.
            $caption = $table->getElementsByTagName('caption');
            if ($caption->length > 0 && $caption->item(0)->childNodes->length > 0) {
                $table->setReadabilityDataTable(true);
                continue;
            }

            // If the table has a descendant with any of these tags, consider a data table:
            foreach (['col', 'colgroup', 'tfoot', 'thead', 'th'] as $dataTableDescendants) {
                if ($table->getElementsByTagName($dataTableDescendants)->length > 0) {
                    $table->setReadabilityDataTable(true);
                    // Move on to the next table, not to the next tag name.
                    continue 2;
                }
            }

            // Nested tables indicate a layout table:
            if ($table->getElementsByTagName('table')->length > 0) {
                $table->setReadabilityDataTable(false);
                continue;
            }

            $sizeInfo = $table->getRowAndColumnCount();
            if ($sizeInfo['rows'] >= 10 || $sizeInfo['columns'] > 4) {
                $table->setReadabilityDataTable(true);
                continue;
            }

            // Now just go by size entirely:
            $table->setReadabilityDataTable($sizeInfo['rows'] * $sizeInfo['columns'] > 10);
        }
    }
1358
1359
    /**
     * Removes the style attribute and the deprecated presentational attributes from the given
     * node and from all of its descendants. SVG elements are left untouched, children included.
     *
     * @param $node DOMDocument|DOMNode
     **/
    public function _cleanStyles($node)
    {
        $hasTagName = property_exists($node, 'tagName');

        // An <svg> subtree is skipped completely.
        if ($hasTagName && $node->tagName === 'svg') {
            return;
        }

        // Do not bother if there's no method to remove an attribute
        if (method_exists($node, 'removeAttribute')) {
            // Remove `style` and deprecated presentational attributes
            $attributesToDrop = ['align', 'background', 'bgcolor', 'border', 'cellpadding', 'cellspacing', 'frame', 'hspace', 'rules', 'style', 'valign', 'vspace'];
            foreach ($attributesToDrop as $attribute) {
                $node->removeAttribute($attribute);
            }

            // width and height are only removed from these elements.
            $sizedElements = ['table', 'th', 'td', 'hr', 'pre'];
            if ($hasTagName && in_array($node->tagName, $sizedElements, true)) {
                $node->removeAttribute('width');
                $node->removeAttribute('height');
            }
        }

        // Recurse into every child node.
        for ($child = $node->firstChild; $child !== null; $child = $child->nextSibling) {
            $this->_cleanStyles($child);
        }
    }
1391
1392
    /**
1393
     * Clean out elements whose id/class combinations match specific string.
1394
     *
1395
     * @param $node DOMElement Node to clean
1396
     * @param $regex string Match id/class combination.
1397
     *
1398
     * @return void
1399
     **/
1400
    public function _cleanMatchedNodes($node, $regex)
    {
        // Sentinel at which the walk stops.
        // NOTE(review): presumably the `true` flag makes getNextNode() skip $node's children,
        // so the sentinel is the first node after $node's subtree - confirm against NodeUtility.
        $stopAt = NodeUtility::getNextNode($node, true);
        $current = NodeUtility::getNextNode($node);

        while ($current && $current !== $stopAt) {
            $class = $current->getAttribute('class');
            $id = $current->getAttribute('id');

            // No match on "class id": simply advance to the next node.
            if (!preg_match($regex, sprintf('%s %s', $class, $id))) {
                $current = NodeUtility::getNextNode($current);
                continue;
            }

            $this->logger->debug(sprintf('Removing matched node with regex: \'%s\', node class was: \'%s\', id: \'%s\'', $regex, $class, $id));

            // Removal hands back the node from which the walk continues.
            $current = NodeUtility::removeAndGetNext($current);
        }
    }
1413
1414
    /**
1415
     * @param DOMDocument $article
1416
     *
1417
     * @return void
1418
     */
1419
    public function _cleanExtraParagraphs(DOMDocument $article)
    {
        // Removes every <p> that holds neither media elements nor any non-whitespace text.
        $paragraphs = $article->getElementsByTagName('p');

        // Walk the live node list backwards so a removal never shifts the items still to be visited.
        for ($i = $paragraphs->length - 1; $i >= 0; $i--) {
            $paragraph = $paragraphs->item($i);

            // At this point, nasty iframes have been removed; only embedded video ones remain.
            $mediaCount = 0;
            foreach (['img', 'embed', 'object', 'iframe'] as $mediaTag) {
                $mediaCount += $paragraph->getElementsByTagName($mediaTag)->length;
            }

            if ($mediaCount > 0) {
                continue;
            }

            $remainingText = preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $paragraph->textContent);

            /*
             * Compare strictly against the empty string. A plain truthiness check would also
             * treat a paragraph whose whole text is "0" as empty, and would delete content when
             * preg_replace() fails and returns null. In both cases the paragraph is kept.
             */
            if ($remainingText !== '') {
                continue;
            }

            $this->logger->debug(sprintf('[PrepArticle] Removing extra paragraph. Text content was: \'%s\'', substr($paragraph->textContent, 0, 128)));
            $paragraph->parentNode->removeChild($paragraph);
        }
    }
1440
1441
    /**
1442
     * @param DOMDocument $article
1443
     * @param string $tag Tag to clean conditionally
1444
     *
1445
     * @return void
1446
     */
1447
    public function _cleanConditionally(DOMDocument $article, $tag)
    {
        // Conditional cleaning can be switched off through the configuration.
        if (!$this->configuration->getCleanConditionally()) {
            return;
        }

        $isList = in_array($tag, ['ul', 'ol'], true);

        /*
         * Gather counts for other typical elements embedded within.
         * Traverse backwards so we can remove nodes at the same time
         * without affecting the traversal.
         */
        $DOMNodeList = $article->getElementsByTagName($tag);
        for ($i = $DOMNodeList->length - 1; $i >= 0; $i--) {
            /** @var $node DOMElement */
            $node = $DOMNodeList->item($i);

            // First check if we're in a data table, in which case don't remove us.
            $insideDataTable = $node->hasAncestorTag('table', -1, function ($ancestor) {
                return $ancestor->isReadabilityDataTable();
            });
            if ($insideDataTable) {
                continue;
            }

            // Class weight only counts when the configuration enables it.
            $weight = 0;
            if ($this->configuration->getWeightClasses()) {
                $weight = $node->getClassWeight();
            }

            // Only strictly negative weights trigger removal; a weight of exactly 0 falls through.
            if ($weight < 0) {
                $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\' with negative weight', $tag));

                NodeUtility::removeNode($node);
                continue;
            }

            // Nodes with 10 or more commas are kept without further checks.
            if (substr_count($node->getTextContent(), ',') >= 10) {
                continue;
            }

            /*
             * If there are not very many commas, and the number of
             * non-paragraph elements is more than paragraphs or other
             * ominous signs, remove the element.
             */
            $p = $node->getElementsByTagName('p')->length;
            $img = $node->getElementsByTagName('img')->length;
            // The -100 offset means the "$li > $p" rule only fires with over 100 more list items than paragraphs.
            $li = $node->getElementsByTagName('li')->length - 100;
            $input = $node->getElementsByTagName('input')->length;

            // Only embeds whose markup matches the 'videos' pattern are counted.
            $embedCount = 0;
            foreach ($node->getElementsByTagName('embed') as $embedNode) {
                if (preg_match(NodeUtility::$regexps['videos'], $embedNode->C14N())) {
                    $embedCount++;
                }
            }

            $linkDensity = $node->getLinkDensity();
            $contentLength = mb_strlen($node->getTextContent(true));

            // "$img > 1" guards the division by $img in the first rule.
            $haveToRemove =
                ($img > 1 && $p / $img < 0.5 && !$node->hasAncestorTag('figure')) ||
                (!$isList && $li > $p) ||
                ($input > floor($p / 3)) ||
                (!$isList && $contentLength < 25 && ($img === 0 || $img > 2) && !$node->hasAncestorTag('figure')) ||
                (!$isList && $weight < 25 && $linkDensity > 0.2) ||
                ($weight >= 25 && $linkDensity > 0.5) ||
                (($embedCount === 1 && $contentLength < 75) || $embedCount > 1);

            if ($haveToRemove) {
                $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\'.', $tag));

                NodeUtility::removeNode($node);
            }
        }
    }
1527
1528
    /**
1529
     * Clean a node of all elements of type "tag".
1530
     * (Unless it's a youtube/vimeo video. People love movies.).
1531
     *
1532
     * @param $article DOMDocument
1533
     * @param $tag string tag to clean
1534
     *
1535
     * @return void
1536
     **/
1537
    public function _clean(DOMDocument $article, $tag)
    {
        $isEmbed = in_array($tag, ['object', 'embed', 'iframe']);
        $nodes = $article->getElementsByTagName($tag);

        // Last-to-first over the live list, so removals never disturb the items still pending.
        for ($index = $nodes->length - 1; $index >= 0; $index--) {
            $item = $nodes->item($index);

            // Allow youtube and vimeo videos through as people usually want to see those.
            if ($isEmbed) {
                $values = [];
                foreach ($item->attributes as $attribute) {
                    $values[] = $attribute->nodeValue;
                }

                $videoPattern = NodeUtility::$regexps['videos'];

                // Check the element's own attribute values first; only if those do not match,
                // check the markup of the element and everything inside it.
                if (preg_match($videoPattern, implode('|', $values)) || preg_match($videoPattern, $item->C14N())) {
                    continue;
                }
            }

            $this->logger->debug(sprintf('[PrepArticle] Removing node \'%s\'.', $item->tagName));

            NodeUtility::removeNode($item);
        }
    }
1569
1570
    /**
1571
     * Clean out spurious headers from an Element. Checks things like classnames and link density.
1572
     *
1573
     * @param DOMDocument $article
1574
     *
1575
     * @return void
1576
     **/
1577
    public function _cleanHeaders(DOMDocument $article)
    {
        // With class weighting disabled every header weighs 0, so none can be negative.
        if (!$this->configuration->getWeightClasses()) {
            return;
        }

        // Only <h1> and <h2> are inspected.
        foreach (['h1', 'h2'] as $headerTag) {
            /*
             * getElementsByTagName() returns a live list. Removing a node while iterating it
             * directly makes the iterator skip the following header, so work on a snapshot.
             */
            $headers = iterator_to_array($article->getElementsByTagName($headerTag));

            /** @var $header DOMElement */
            foreach ($headers as $header) {
                // Only strictly negative class weights lead to removal.
                if ($header->getClassWeight() >= 0) {
                    continue;
                }

                // mb_substr() keeps the logged excerpt from ending in the middle of a multi-byte character.
                $this->logger->debug(sprintf('[PrepArticle] Removing H node with negative weight. Content was: \'%s\'', mb_substr($header->nodeValue, 0, 128, 'UTF-8')));

                NodeUtility::removeNode($header);
            }
        }
    }
1596
1597
    /**
1598
     * Removes the class="" attribute from every element in the given
1599
     * subtree.
1600
     *
1601
     * Readability.js has a special filter to avoid cleaning the classes that the algorithm adds. We don't add classes
1602
     * here so no need to filter those.
1603
     *
1604
     * @param DOMDocument|DOMNode $node
1605
     *
1606
     * @return void
1607
     **/
1608
    public function _cleanClasses($node)
    {
        // Drop the class attribute of the current node, if it carries a non-empty one.
        if ($node->getAttribute('class') !== '') {
            $node->removeAttribute('class');
        }

        /*
         * Start at the first element child and follow nextSibling. nextSibling also yields
         * text and other non-element nodes; those cannot carry a class attribute, so only
         * elements are recursed into. A separate loop variable keeps the parameter intact.
         */
        for ($child = $node->getFirstElementChild(); $child !== null; $child = $child->nextSibling) {
            if ($child instanceof \DOMElement) {
                $this->_cleanClasses($child);
            }
        }
    }
1618
1619
    /**
1620
     * @param DOMDocument $article
1621
     *
1622
     * @return DOMDocument
1623
     */
1624
    public function postProcessContent(DOMDocument $article)
    {
        $this->logger->info('[PostProcess] PostProcessing content...');

        // Readability cannot open relative uris so we convert them to absolute uris.
        if ($this->configuration->getFixRelativeURLs()) {
            // Links may be replaced below, so iterate over a snapshot of the list.
            $anchors = iterator_to_array($article->getElementsByTagName('a'));
            foreach ($anchors as $link) {
                /** @var DOMElement $link */
                $href = $link->getAttribute('href');
                if (!$href) {
                    continue;
                }

                if (strpos($href, 'javascript:') !== 0) {
                    $this->logger->debug(sprintf('[PostProcess] Converting link to absolute URI: \'%s\'', substr($href, 0, 128)));

                    $link->setAttribute('href', $this->toAbsoluteURI($href));
                    continue;
                }

                // Replace links with javascript: URIs with text content, since
                // they won't work after scripts have been removed from the page.
                $this->logger->debug(sprintf('[PostProcess] Removing \'javascript:\' link. Content is: \'%s\'', substr($link->textContent, 0, 128)));

                $replacement = $article->createTextNode($link->textContent);
                $link->parentNode->replaceChild($replacement, $link);
            }

            // Candidate attributes for the image URL, in priority order; the first non-empty one wins.
            $sourceAttributes = ['src', 'data-src', 'data-original', 'data-orig', 'data-url'];
            foreach ($article->getElementsByTagName('img') as $img) {
                /** @var DOMElement $img */
                $src = false;
                foreach ($sourceAttributes as $attributeName) {
                    $candidate = $img->getAttribute($attributeName);
                    if ($candidate) {
                        $src = $candidate;
                        break;
                    }
                }

                if ($src) {
                    $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($src, 0, 128)));

                    $img->setAttribute('src', $this->toAbsoluteURI($src));
                }
            }
        }

        // Class attributes are stripped regardless of the URL-fixing setting.
        $this->_cleanClasses($article);

        return $article;
    }
1676
1677
    /**
1678
     * @return string
1679
     */
1680
    public function __toString()
    {
        // The title wrapped in an <h1>, immediately followed by the serialized content.
        $title = $this->getTitle();
        $content = $this->getContent();

        return '<h1>' . $title . '</h1>' . $content;
    }
1684
1685
    /**
1686
     * @return string|null
1687
     */
1688
    public function getTitle()
    {
        // Plain accessor for the stored title.
        return $this->title;
    }
1692
1693
    /**
1694
     * @param string $title
1695
     */
1696
    protected function setTitle($title)
    {
        // Stores the value as given; no validation or normalisation happens here.
        $this->title = $title;
    }
1700
1701
    /**
1702
     * @return string|null
1703
     */
1704
    public function getContent()
    {
        // Without a parsed document there is nothing to serialize.
        if (!($this->content instanceof DOMDocument)) {
            return null;
        }

        // Serialize the document with C14N().
        return $this->content->C14N();
    }
1708
1709
    /**
1710
     * @return DOMDocument|null
1711
     */
1712
    public function getDOMDocument()
    {
        // Returns the stored content document object itself, not its serialization.
        return $this->content;
    }
1716
1717
    /**
1718
     * @param DOMDocument $content
1719
     */
1720
    protected function setContent(DOMDocument $content)
    {
        // Keeps a reference to the given document; no copy is made.
        $this->content = $content;
    }
1724
1725
    /**
1726
     * @return null|string
1727
     */
1728
    public function getExcerpt()
    {
        // Plain accessor for the stored excerpt.
        return $this->excerpt;
    }
1732
1733
    /**
1734
     * @param null|string $excerpt
1735
     */
1736
    public function setExcerpt($excerpt)
    {
        // Stores the value as given.
        $this->excerpt = $excerpt;
    }
1740
1741
    /**
1742
     * @return string|null
1743
     */
1744
    public function getImage()
    {
        // Plain accessor for the stored main image.
        return $this->image;
    }
1748
1749
    /**
1750
     * @param string $image
1751
     */
1752
    protected function setImage($image)
    {
        // Stores the value as given.
        $this->image = $image;
    }
1756
1757
    /**
1758
     * @return string|null
1759
     */
1760
    public function getAuthor()
    {
        // Plain accessor for the stored author.
        return $this->author;
    }
1764
1765
    /**
1766
     * @param string $author
1767
     */
1768
    protected function setAuthor($author)
    {
        // Stores the value as given.
        $this->author = $author;
    }
1772
1773
    /**
1774
     * @return string|null
1775
     */
1776
    public function getSiteName()
    {
        // Plain accessor for the stored site name.
        return $this->siteName;
    }
1780
1781
    /**
1782
     * @param string $siteName
1783
     */
1784
    protected function setSiteName($siteName)
    {
        // Stores the value as given.
        $this->siteName = $siteName;
    }
1788
1789
    /**
1790
     * @return null|string
1791
     */
1792
    public function getDirection()
    {
        // Plain accessor for the stored direction value.
        return $this->direction;
    }
1796
1797
    /**
1798
     * @param null|string $direction
1799
     */
1800
    public function setDirection($direction)
    {
        // Stores the value as given.
        $this->direction = $direction;
    }
1804
}
1805