Passed
Push — master ( b47649...99e771 )
by Rougin
02:02
created

Scraper::fbpost()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 14
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 4
CRAP Score 1.125

Importance

Changes 0
Metric Value
cc 1
eloc 6
nc 1
nop 1
dl 0
loc 14
ccs 4
cts 8
cp 0.5
crap 1.125
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace Pilipinews\Website\Inquirer;
4
5
use Pilipinews\Common\Article;
6
use Pilipinews\Common\Client;
7
use Pilipinews\Common\Crawler as DomCrawler;
8
use Pilipinews\Common\Interfaces\ScraperInterface;
9
use Pilipinews\Common\Scraper as AbstractScraper;
10
11
/**
12
 * Inquirer News Scraper
13
 *
14
 * @package Pilipinews
15
 * @author  Rougin Royce Gutib <[email protected]>
16
 */
17
class Scraper extends AbstractScraper implements ScraperInterface
{
    /**
     * Promotional footer text that scrape() strips from the article body.
     *
     * @var string
     */
    const TEXT_FOOTER = 'Subscribe to INQUIRER PLUS (http://www.inquirer.net/plus) to get access to The Philippine Daily Inquirer & other 70+ titles, share up to 5 gadgets, listen to the news, download as early as 4am & share articles on social media. Call 896 6000.';

    /**
     * Texts handed to html() as its second argument when the body is converted.
     * NOTE(review): presumably texts to be dropped from the output; confirm
     * against the parent scraper's html() method.
     *
     * @var string[]
     */
    protected $refresh = array('Refresh this page for updates.');

    /**
     * Selectors handed to remove() before the article body is extracted.
     *
     * @var string[]
     */
    protected $removables = array(
        'script',
        '#billboard_article',
        '.ventuno-vid',
        '#article_disclaimer',
        '.OUTBRAIN',
        '#ch-follow-us',
        '.view-comments',
        '#article_tags',
        '.adsbygoogle',
        '#article-new-featured',
        '#read-next-2018',
        '#rn-lbl',
        '#fb-root',
    );

    /**
     * Returns the contents of an article.
     *
     * The link is lower-cased before it is requested. The body taken from
     * "#article_content" goes through the caption, Facebook video and Facebook
     * post conversions, then through html(), and finally has TEXT_FOOTER removed.
     *
     * @param  string $link
     * @return \Pilipinews\Common\Article
     */
    public function scrape($link)
    {
        $this->prepare(mb_strtolower($link));

        $title = $this->title('.entry-title');

        // Drops a "-<digits>x<digits>" suffix found right before ".jpg"
        // (presumably thumbnail dimensions). The dot is escaped so that only
        // a literal ".jpg" extension is matched.
        $pattern = '/-(\d+)x(\d+)\.jpg/i';

        $this->remove((array) $this->removables);

        $body = $this->body('#article_content');

        $body = $this->caption($body);

        $body = $this->fbvideo($body);

        $body = $this->fbpost($body)->html();

        $body = preg_replace($pattern, '.jpg', $body);

        $body = $this->html(new DomCrawler($body), $this->refresh);

        $body = str_replace(self::TEXT_FOOTER, '', trim($body));

        // Trimmed again as removing the footer may leave whitespace behind.
        return new Article($title, trim($body));
    }

    /**
     * Converts caption elements to readable string.
     *
     * Each ".wp-caption" element is replaced by a "PHOTO: <src>" paragraph,
     * built from its first image, followed by a paragraph containing the HTML
     * of its first ".wp-caption-text" element.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function caption(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            $image = $crawler->filter('img')->first()->attr('src');

            $text = $crawler->filter('.wp-caption-text')->first();

            return sprintf('<p>PHOTO: %s</p><p>%s</p>', $image, $text->html());
        };

        return $this->replace($crawler, '.wp-caption', $callback);
    }

    /**
     * Converts Facebook embedded posts to readable string.
     *
     * Each ".fb-xfbml-parse-ignore" element is replaced by a "POST: <cite>"
     * paragraph followed by a paragraph with the text of its first "p > a".
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function fbpost(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            // The "cite" attribute is read once and reused below.
            $link = $crawler->attr('cite');

            $message = $crawler->filter('p > a')->first();

            return '<p>POST: ' . $link . '</p><p>' . $message->text() . '</p>';
        };

        return $this->replace($crawler, '.fb-xfbml-parse-ignore', $callback);
    }

    /**
     * Converts fbvideo elements to readable string.
     *
     * Each ".fb-video" element is replaced by a "VIDEO: <data-href>" paragraph.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function fbvideo(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            return '<p>VIDEO: ' . $crawler->attr('data-href') . '</p>';
        };

        return $this->replace($crawler, '.fb-video', $callback);
    }

    /**
     * Initializes the crawler instance.
     *
     * Requests the link, removes the typhoon news notices from the response,
     * replaces "<strong> </strong>" with a single space, and stores the result
     * as a new crawler in $this->crawler.
     *
     * @param  string $link
     * @return void
     */
    protected function prepare($link)
    {
        $response = Client::request((string) $link);

        // Both spellings of the notice are removed; the first one carries a
        // stray double quote before the closing paragraph tag.
        $notices = array(
            '<p>Click <a href="https://www.inquirer.net/philippine-typhoon-news">here</a> for more weather related news."</p>',
            '<p>Click <a href="https://www.inquirer.net/philippine-typhoon-news">here</a> for more weather related news.</p>',
        );

        $response = str_replace($notices, '', $response);

        $response = str_replace('<strong> </strong>', ' ', $response);

        $this->crawler = new DomCrawler($response);
    }
}
159