Scraper::scrape()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 31
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 16
CRAP Score 1

Importance

Changes 6
Bugs 0 Features 0
Metric Value
cc 1
eloc 15
c 6
b 0
f 0
nc 1
nop 1
dl 0
loc 31
ccs 16
cts 16
cp 1
crap 1
rs 9.7666
1
<?php
2
3
namespace Pilipinews\Website\Inquirer;
4
5
use Pilipinews\Common\Article;
6
use Pilipinews\Common\Client;
7
use Pilipinews\Common\Crawler as DomCrawler;
8
use Pilipinews\Common\Interfaces\ScraperInterface;
9
use Pilipinews\Common\Scraper as AbstractScraper;
10
11
/**
12
 * Inquirer News Scraper
13
 *
14
 * @package Pilipinews
15
 * @author  Rougin Gutib <[email protected]>
16
 */
17
class Scraper extends AbstractScraper implements ScraperInterface
{
    /**
     * Subscription footer, written with literal ampersands.
     */
    const TEXT_FOOTER = 'Subscribe to INQUIRER PLUS (https://www.inquirer.net/plus) to get access to The Philippine Daily Inquirer & other 70+ titles, share up to 5 gadgets, listen to the news, download as early as 4am & share articles on social media. Call 896 6000.';

    /**
     * Same subscription footer, with the ampersands HTML-encoded as "&amp;".
     */
    const TEXT_FOOTER2 = 'Subscribe to INQUIRER PLUS (https://www.inquirer.net/plus) to get access to The Philippine Daily Inquirer &amp; other 70+ titles, share up to 5 gadgets, listen to the news, download as early as 4am &amp; share articles on social media. Call 896 6000.';

    /**
     * Coronavirus news footer, including the two newlines that precede it.
     */
    const TEXT_FOOTER3 = "\n\n" . 'For more news about the novel coronavirus click here. (https://www.inquirer.net/novel-coronavirus-update)';

    /**
     * Coronavirus explainer footer, including the two newlines that precede it.
     */
    const TEXT_FOOTER4 = "\n\n" . 'What you need to know about Coronavirus. (https://newsinfo.inquirer.net/1243479/coronavirus-pandemic-2020-everything-you-need-to-know)';

    /**
     * Texts handed to the inherited html() helper as its second argument.
     *
     * @var string[]
     */
    protected $refresh = array('Refresh this page for updates.');

    /**
     * Selectors handed to the inherited remove() helper before the body is read.
     *
     * @var string[]
     */
    protected $removables = array(
        '#ms-slider-wrap',
        '#mr-2018-wrap',
        'script',
        '#billboard_article',
        '.ventuno-vid',
        '#article_disclaimer',
        '.OUTBRAIN',
        '#ch-follow-us',
        '.view-comments',
        '#article_tags',
        '.adsbygoogle',
        '#article-new-featured',
        '#read-next-2018',
        '#rn-lbl',
        '#fb-root',
        '#lsmr-lbl',
        '#lsmr-box',
        '.bb_iawr',
        'style',
        '.ob_amplifypixel',
        '.lazyload',
    );

    /**
     * Returns the contents of an article.
     *
     * The link is lower-cased before it is requested, while the returned
     * article keeps the link exactly as it was given.
     *
     * @param  string $link
     * @return \Pilipinews\Common\Article
     */
    public function scrape($link)
    {
        $this->prepare(mb_strtolower($link));

        $title = $this->title('.entry-title');

        $this->remove((array) $this->removables);

        $body = $this->body('#article_content');

        $body = $this->caption($body);

        $body = $this->fbvideo($body);

        $body = $this->fbpost($body)->html();

        // Drops the "-<width>x<height>" suffix from JPEG file names. The dot
        // is escaped so that only a literal ".jpg" extension is matched.
        $body = preg_replace('/-(\d+)x(\d+)\.jpg/i', '.jpg', $body);

        $body = $this->html(new DomCrawler($body), $this->refresh);

        // The order below is the order in which the footers are stripped.
        $footers = array(
            self::TEXT_FOOTER3,
            self::TEXT_FOOTER,
            self::TEXT_FOOTER4,
            self::TEXT_FOOTER2,
        );

        foreach ($footers as $footer)
        {
            // The body is trimmed before every replacement, so whitespace
            // left behind by a removed footer is gone before the next one.
            $body = str_replace($footer, '', trim($body));
        }

        return new Article($title, trim($body), (string) $link);
    }

    /**
     * Converts caption elements to readable string.
     *
     * Each ".wp-caption" element is passed to a callback that returns a
     * "PHOTO: <image source> - <caption html>" paragraph.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function caption(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            $image = $crawler->filter('img')->first()->attr('src');

            $text = $crawler->filter('.wp-caption-text')->first();

            return sprintf('<p>PHOTO: %s - %s</p>', $image, $text->html());
        };

        // NOTE(review): replace() is inherited; presumably it swaps every
        // matched element with the markup returned by the callback - confirm.
        return $this->replace($crawler, '.wp-caption', $callback);
    }

    /**
     * Converts Facebook embedded posts to readable string.
     *
     * Each ".fb-xfbml-parse-ignore" element is passed to a callback that
     * returns a "POST: <cite attribute>" paragraph followed by a paragraph
     * holding the text of its first "p > a" element.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function fbpost(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            // Read the attribute once and reuse it.
            $link = $crawler->attr('cite');

            $message = $crawler->filter('p > a')->first();

            return '<p>POST: ' . $link . '</p><p>' . $message->text() . '</p>';
        };

        return $this->replace($crawler, '.fb-xfbml-parse-ignore', $callback);
    }

    /**
     * Converts fbvideo elements to readable string.
     *
     * Each ".fb-video" element is passed to a callback that returns a
     * "VIDEO: <data-href attribute>" paragraph.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function fbvideo(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            return '<p>VIDEO: ' . $crawler->attr('data-href') . '</p>';
        };

        return $this->replace($crawler, '.fb-video', $callback);
    }

    /**
     * Initializes the crawler instance.
     *
     * Requests the link, strips two variants of the weather notice, turns a
     * bold-wrapped space into a plain space, then stores a crawler built
     * from the cleaned response.
     *
     * @param  string $link
     * @return void
     */
    protected function prepare($link)
    {
        $response = Client::request((string) $link);

        // str_replace() applies these pairs in order, each on the result of
        // the previous one. The first notice has a stray double quote.
        $search = array(
            '<p>Click <a href="https://www.inquirer.net/philippine-typhoon-news">here</a> for more weather related news."</p>',
            '<p>Click <a href="https://www.inquirer.net/philippine-typhoon-news">here</a> for more weather related news.</p>',
            '<strong> </strong>',
        );

        $replace = array('', '', ' ');

        $response = str_replace($search, $replace, $response);

        $this->crawler = new DomCrawler($response);
    }
}
179