Scraper::scrape() - Code Metrics - pilipinews/inquirer - Measure and Improve Code Quality continuously with Scrutinizer

Scraper::scrape() A
last analyzed 2020-07-12 03:39 UTC

↳ Parent: Scraper

Complexity

Conditions	1
Paths	1

Size

Total Lines	31
Code Lines	15

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	16
CRAP Score	1

Importance

Changes	6
Bugs	0	Features	0

Metric	Value
cc	1
eloc	15
c	6
b	0
f	0
nc	1
nop	1
dl	0
loc	31
ccs	16
cts	16
cp	1
crap	1
rs	9.7666

<?php

namespace Pilipinews\Website\Inquirer;

use Pilipinews\Common\Article;
use Pilipinews\Common\Client;
use Pilipinews\Common\Crawler as DomCrawler;
use Pilipinews\Common\Interfaces\ScraperInterface;
use Pilipinews\Common\Scraper as AbstractScraper;

/**
 * Inquirer News Scraper
 *
 * @package Pilipinews
 * @author  Rougin Gutib <[email protected]>
 */
class Scraper extends AbstractScraper implements ScraperInterface
{
    /**
     * Subscription footer, written with raw ampersands.
     */
    const TEXT_FOOTER = 'Subscribe to INQUIRER PLUS (https://www.inquirer.net/plus) to get access to The Philippine Daily Inquirer & other 70+ titles, share up to 5 gadgets, listen to the news, download as early as 4am & share articles on social media. Call 896 6000.';

    /**
     * Same subscription footer, written with HTML-encoded ampersands.
     */
    const TEXT_FOOTER2 = 'Subscribe to INQUIRER PLUS (https://www.inquirer.net/plus) to get access to The Philippine Daily Inquirer &amp; other 70+ titles, share up to 5 gadgets, listen to the news, download as early as 4am &amp; share articles on social media. Call 896 6000.';

    /**
     * Coronavirus news footer, including the blank line that precedes it.
     */
    const TEXT_FOOTER3 = "\n\n" . 'For more news about the novel coronavirus click here. (https://www.inquirer.net/novel-coronavirus-update)';

    /**
     * Coronavirus explainer footer, including the blank line that precedes it.
     */
    const TEXT_FOOTER4 = "\n\n" . 'What you need to know about Coronavirus. (https://newsinfo.inquirer.net/1243479/coronavirus-pandemic-2020-everything-you-need-to-know)';

    /**
     * Texts handed to the parent's html() helper when rendering the body.
     * NOTE(review): presumably these are stripped from the output - confirm
     * against the parent class.
     *
     * @var string[]
     */
    protected $refresh = array('Refresh this page for updates.');

    /**
     * Selectors handed to the parent's remove() helper before the article
     * body is extracted.
     *
     * @var string[]
     */
    protected $removables = array(
        '#ms-slider-wrap',
        '#mr-2018-wrap',
        'script',
        '#billboard_article',
        '.ventuno-vid',
        '#article_disclaimer',
        '.OUTBRAIN',
        '#ch-follow-us',
        '.view-comments',
        '#article_tags',
        '.adsbygoogle',
        '#article-new-featured',
        '#read-next-2018',
        '#rn-lbl',
        '#fb-root',
        '#lsmr-lbl',
        '#lsmr-box',
        '.bb_iawr',
        'style',
        '.ob_amplifypixel',
        '.lazyload',
    );

    /**
     * Returns the contents of an article.
     *
     * The page is requested using the lower-cased link, while the returned
     * article keeps the link exactly as it was given.
     *
     * @param  string $link
     * @return \Pilipinews\Common\Article
     */
    public function scrape($link)
    {
        $this->prepare(mb_strtolower($link));

        $title = $this->title('.entry-title');

        $this->remove((array) $this->removables);

        $body = $this->body('#article_content');

        $body = $this->caption($body);

        $body = $this->fbvideo($body);

        $body = $this->fbpost($body)->html();

        // Drops a "-<width>x<height>" size suffix from .jpg file names
        // (e.g. "photo-620x413.jpg" becomes "photo.jpg"). The dot is escaped
        // so that only a literal ".jpg" extension is matched.
        $body = preg_replace('/-\d+x\d+\.jpg/i', '.jpg', $body);

        $body = $this->html(new DomCrawler($body), $this->refresh);

        // The order matters: the coronavirus footers start with a blank line,
        // so each pass trims the text first to expose the next footer.
        $footers = array(
            self::TEXT_FOOTER3,
            self::TEXT_FOOTER,
            self::TEXT_FOOTER4,
            self::TEXT_FOOTER2,
        );

        foreach ($footers as $footer)
        {
            $body = str_replace($footer, '', trim($body));
        }

        return new Article($title, trim($body), (string) $link);
    }

    /**
     * Converts caption elements to readable string.
     *
     * Each ".wp-caption" element is replaced by a paragraph containing the
     * source of its first image and the HTML of its first caption text.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function caption(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            $image = $crawler->filter('img')->first()->attr('src');

            $text = $crawler->filter('.wp-caption-text')->first();

            return sprintf('<p>PHOTO: %s - %s</p>', $image, $text->html());
        };

        return $this->replace($crawler, '.wp-caption', $callback);
    }

    /**
     * Converts Facebook embedded posts to readable string.
     *
     * Each ".fb-xfbml-parse-ignore" element is replaced by a paragraph with
     * its "cite" link, followed by a paragraph with the text of its first
     * "p > a" element.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function fbpost(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            $link = $crawler->attr('cite');

            $message = $crawler->filter('p > a')->first();

            return '<p>POST: ' . $link . '</p><p>' . $message->text() . '</p>';
        };

        return $this->replace($crawler, '.fb-xfbml-parse-ignore', $callback);
    }

    /**
     * Converts fbvideo elements to readable string.
     *
     * Each ".fb-video" element is replaced by a paragraph containing the
     * value of its "data-href" attribute.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function fbvideo(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            return '<p>VIDEO: ' . $crawler->attr('data-href') . '</p>';
        };

        return $this->replace($crawler, '.fb-video', $callback);
    }

    /**
     * Initializes the crawler instance.
     *
     * The response is cleaned of the weather news notice (in both of its
     * known spellings) and of bold tags that only wrap a single space before
     * it is loaded into the crawler.
     *
     * @param  string $link
     * @return void
     */
    protected function prepare($link)
    {
        $response = Client::request((string) $link);

        // str_replace() applies the pairs one after another, in this order.
        $search = array(
            '<p>Click <a href="https://www.inquirer.net/philippine-typhoon-news">here</a> for more weather related news."</p>',
            '<p>Click <a href="https://www.inquirer.net/philippine-typhoon-news">here</a> for more weather related news.</p>',
            '<strong> </strong>',
        );

        $replace = array('', '', ' ');

        $response = str_replace($search, $replace, $response);

        $this->crawler = new DomCrawler($response);
    }
}


1		<?php
2
3		namespace Pilipinews\Website\Inquirer;
4
5		use Pilipinews\Common\Article;
6		use Pilipinews\Common\Client;
7		use Pilipinews\Common\Crawler as DomCrawler;
8		use Pilipinews\Common\Interfaces\ScraperInterface;
9		use Pilipinews\Common\Scraper as AbstractScraper;
10
11		/**
12		* Inquirer News Scraper
13		*
14		* @package Pilipinews
15		* @author Rougin Gutib <[email protected]>
16		*/
17		class Scraper extends AbstractScraper implements ScraperInterface
18		{
19		const TEXT_FOOTER = 'Subscribe to INQUIRER PLUS (https://www.inquirer.net/plus) to get access to The Philippine Daily Inquirer & other 70+ titles, share up to 5 gadgets, listen to the news, download as early as 4am & share articles on social media. Call 896 6000.';
20
21		const TEXT_FOOTER2 = 'Subscribe to INQUIRER PLUS (https://www.inquirer.net/plus) to get access to The Philippine Daily Inquirer &amp; other 70+ titles, share up to 5 gadgets, listen to the news, download as early as 4am &amp; share articles on social media. Call 896 6000.';
22
23		const TEXT_FOOTER3 = "\n\n" . 'For more news about the novel coronavirus click here. (https://www.inquirer.net/novel-coronavirus-update)';
24
25		const TEXT_FOOTER4 = "\n\n" . 'What you need to know about Coronavirus. (https://newsinfo.inquirer.net/1243479/coronavirus-pandemic-2020-everything-you-need-to-know)';
26
27		/**
28		* @var string[]
29		*/
30		protected $refresh = array('Refresh this page for updates.');
31
32		/**
33		* @var string[]
34		*/
35		protected $removables = array(
36		'#ms-slider-wrap',
37		'#mr-2018-wrap',
38		'script',
39		'#billboard_article',
40		'.ventuno-vid',
41		'#article_disclaimer',
42		'.OUTBRAIN',
43		'#ch-follow-us',
44		'.view-comments',
45		'#article_tags',
46		'.adsbygoogle',
47		'#article-new-featured',
48		'#read-next-2018',
49		'#rn-lbl',
50		'#fb-root',
51		'#lsmr-lbl',
52		'#lsmr-box',
53		'.bb_iawr',
54		'style',
55		'.ob_amplifypixel',
56		'.lazyload',
57		);
58
59		/**
60		* Returns the contents of an article.
61		*
62		* @param string $link
63		* @return \Pilipinews\Common\Article
64		*/
65	42	public function scrape($link)
66		{
67	42	$this->prepare(mb_strtolower($link));
68
69	42	$title = $this->title('.entry-title');
70
71	42	$pattern = '/-(\d+)x(\d+).jpg/i';
72
73	42	$this->remove((array) $this->removables);
74
75	42	$body = $this->body('#article_content');
76
77	42	$body = $this->caption($body);
78
79	42	$body = $this->fbvideo($body);
80
81	42	$body = $this->fbpost($body)->html();
82
83	42	$body = preg_replace($pattern, '.jpg', $body);
84
85	42	$body = $this->html(new DomCrawler($body), $this->refresh);
86
87	42	$body = str_replace(self::TEXT_FOOTER3, '', trim($body));
88
89	42	$body = str_replace(self::TEXT_FOOTER, '', trim($body));
90
91	42	$body = str_replace(self::TEXT_FOOTER4, '', trim($body));
92
93	42	$body = str_replace(self::TEXT_FOOTER2, '', trim($body));
94
95	42	return new Article($title, trim($body), (string) $link);
96		}
97
98		/**
99		* Converts caption elements to readable string.
100		*
101		* @param \Pilipinews\Common\Crawler $crawler
102		* @return \Pilipinews\Common\Crawler
103		*/
104	14	protected function caption(DomCrawler $crawler)
105		{
106	28	$callback = function (DomCrawler $crawler)
107		{
108	21	$image = $crawler->filter('img')->first()->attr('src');
109
110	21	$format = (string) '<p>PHOTO: %s - %s</p>';
111
112	21	$text = $crawler->filter('.wp-caption-text')->first();
113
114	21	return sprintf($format, $image, $text->html());
115	42	};
116
117	42	return $this->replace($crawler, '.wp-caption', $callback);
118		}
119
120		/**
121		* Converts Facebook embedded posts to readable string.
122		*
123		* @param \Pilipinews\Common\Crawler $crawler
124		* @return \Pilipinews\Common\Crawler
125		*/
126	14	protected function fbpost(DomCrawler $crawler)
127		{
128	28	$callback = function (DomCrawler $crawler)
129		{
130	3	$link = $crawler->attr('cite');
		0 ignored issues – show Unused Code introduced 2018-11-23 16:24 UTC by Report Bug Copy Issue Report The assignment to `$link` is dead and can be removed. Loading history...
131
132	3	$text = '<p>POST: ' . $crawler->attr('cite') . '</p>';
133
134	4	$message = $crawler->filter('p > a')->first();
135
136	3	return $text . '<p>' . $message->text() . '</p>';
137	42	};
138
139	42	return $this->replace($crawler, '.fb-xfbml-parse-ignore', $callback);
140		}
141
142		/**
143		* Converts fbvideo elements to readable string.
144		*
145		* @param \Pilipinews\Common\Crawler $crawler
146		* @return \Pilipinews\Common\Crawler
147		*/
148		protected function fbvideo(DomCrawler $crawler)
149		{
150	42	$callback = function (DomCrawler $crawler)
151		{
152	3	$link = $crawler->attr('data-href');
153
154	3	return '<p>VIDEO: ' . $link . '</p>';
155	42	};
156
157	42	return $this->replace($crawler, '.fb-video', $callback);
158		}
159
160		/**
161		* Initializes the crawler instance.
162		*
163		* @param string $link
164		* @return void
165		*/
166	42	protected function prepare($link)
167		{
168	42	$response = Client::request((string) $link);
169
170	42	$response = str_replace('<p>Click <a href="https://www.inquirer.net/philippine-typhoon-news">here</a> for more weather related news."</p>', '', $response);
171
172	42	$response = str_replace('<p>Click <a href="https://www.inquirer.net/philippine-typhoon-news">here</a> for more weather related news.</p>', '', $response);
173
174	42	$response = str_replace('<strong> </strong>', ' ', $response);
175
176	42	$this->crawler = new DomCrawler($response);
177	42	}
178		}
179

pilipinews / inquirer

Scraper::scrape() A last analyzed 2020-07-12 03:39 UTC

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like

Scraper::scrape() A
last analyzed 2020-07-12 03:39 UTC