1 | <?php |
||
2 | |||
3 | namespace Pilipinews\Website\Inquirer; |
||
4 | |||
5 | use Pilipinews\Common\Article; |
||
6 | use Pilipinews\Common\Client; |
||
7 | use Pilipinews\Common\Crawler as DomCrawler; |
||
8 | use Pilipinews\Common\Interfaces\ScraperInterface; |
||
9 | use Pilipinews\Common\Scraper as AbstractScraper; |
||
10 | |||
11 | /** |
||
12 | * Inquirer News Scraper |
||
13 | * |
||
14 | * @package Pilipinews |
||
15 | * @author Rougin Gutib <[email protected]> |
||
16 | */ |
||
class Scraper extends AbstractScraper implements ScraperInterface
{
    /**
     * Subscription footer text that is stripped from the article body.
     */
    const TEXT_FOOTER = 'Subscribe to INQUIRER PLUS (https://www.inquirer.net/plus) to get access to The Philippine Daily Inquirer & other 70+ titles, share up to 5 gadgets, listen to the news, download as early as 4am & share articles on social media. Call 896 6000.';

    /**
     * Second subscription footer text that is stripped from the article body.
     *
     * NOTE(review): as visible here this value is identical to TEXT_FOOTER;
     * confirm whether the two are meant to differ (e.g. by an entity or a
     * non-breaking space) before merging them.
     */
    const TEXT_FOOTER2 = 'Subscribe to INQUIRER PLUS (https://www.inquirer.net/plus) to get access to The Philippine Daily Inquirer & other 70+ titles, share up to 5 gadgets, listen to the news, download as early as 4am & share articles on social media. Call 896 6000.';

    /**
     * Coronavirus updates footer (including its two leading newlines).
     */
    const TEXT_FOOTER3 = "\n\n" . 'For more news about the novel coronavirus click here. (https://www.inquirer.net/novel-coronavirus-update)';

    /**
     * Coronavirus explainer footer (including its two leading newlines).
     */
    const TEXT_FOOTER4 = "\n\n" . 'What you need to know about Coronavirus. (https://newsinfo.inquirer.net/1243479/coronavirus-pandemic-2020-everything-you-need-to-know)';

    /**
     * Texts handed to html() as its second argument when the body is
     * converted; presumably they are dropped from the output - confirm
     * against the parent scraper.
     *
     * @var string[]
     */
    protected $refresh = array('Refresh this page for updates.');

    /**
     * Selectors handed to remove() before the article body is extracted.
     *
     * @var string[]
     */
    protected $removables = array(
        '#ms-slider-wrap',
        '#mr-2018-wrap',
        'script',
        '#billboard_article',
        '.ventuno-vid',
        '#article_disclaimer',
        '.OUTBRAIN',
        '#ch-follow-us',
        '.view-comments',
        '#article_tags',
        '.adsbygoogle',
        '#article-new-featured',
        '#read-next-2018',
        '#rn-lbl',
        '#fb-root',
        '#lsmr-lbl',
        '#lsmr-box',
        '.bb_iawr',
        'style',
        '.ob_amplifypixel',
        '.lazyload',
    );

    /**
     * Returns the contents of an article.
     *
     * The page is requested with the lower-cased link, while the returned
     * article keeps the link exactly as it was given.
     *
     * @param  string $link
     * @return \Pilipinews\Common\Article
     */
    public function scrape($link)
    {
        $this->prepare(mb_strtolower($link));

        $title = $this->title('.entry-title');

        $this->remove((array) $this->removables);

        $body = $this->body('#article_content');

        $body = $this->caption($body);

        $body = $this->fbvideo($body);

        $body = $this->fbpost($body)->html();

        // Strips a "-<width>x<height>" size suffix placed right before ".jpg".
        // The dot is escaped so that only a literal ".jpg" extension matches.
        $body = preg_replace('/-(\d+)x(\d+)\.jpg/i', '.jpg', $body);

        $body = $this->html(new DomCrawler($body), $this->refresh);

        // The order matters: the body is trimmed before each removal, and
        // two of the footers start with "\n\n".
        $footers = array(
            self::TEXT_FOOTER3,
            self::TEXT_FOOTER,
            self::TEXT_FOOTER4,
            self::TEXT_FOOTER2,
        );

        foreach ($footers as $footer) {
            $body = str_replace($footer, '', trim($body));
        }

        return new Article($title, trim($body), (string) $link);
    }

    /**
     * Converts caption elements to readable string.
     *
     * Every ".wp-caption" element becomes a "PHOTO: <src> - <caption>"
     * paragraph built from its first image and first ".wp-caption-text".
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function caption(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            $image = $crawler->filter('img')->first()->attr('src');

            $text = $crawler->filter('.wp-caption-text')->first();

            return sprintf('<p>PHOTO: %s - %s</p>', $image, $text->html());
        };

        return $this->replace($crawler, '.wp-caption', $callback);
    }

    /**
     * Converts Facebook embedded posts to readable string.
     *
     * Every ".fb-xfbml-parse-ignore" element becomes a "POST: <cite>"
     * paragraph followed by a paragraph with the text of its first "p > a".
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function fbpost(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            // Read the attribute once and reuse it.
            $link = $crawler->attr('cite');

            $message = $crawler->filter('p > a')->first();

            return '<p>POST: ' . $link . '</p><p>' . $message->text() . '</p>';
        };

        return $this->replace($crawler, '.fb-xfbml-parse-ignore', $callback);
    }

    /**
     * Converts fbvideo elements to readable string.
     *
     * Every ".fb-video" element becomes a "VIDEO: <data-href>" paragraph.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function fbvideo(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            return '<p>VIDEO: ' . $crawler->attr('data-href') . '</p>';
        };

        return $this->replace($crawler, '.fb-video', $callback);
    }

    /**
     * Initializes the crawler instance.
     *
     * Requests the page, strips the "weather related news" paragraphs (with
     * and without the stray quote), replaces a "<strong>" element holding a
     * single space with a space, then stores a crawler for the result.
     *
     * @param  string $link
     * @return void
     */
    protected function prepare($link)
    {
        $response = Client::request((string) $link);

        // str_replace() applies the pairs below one after another, in order.
        $search = array(
            '<p>Click <a href="https://www.inquirer.net/philippine-typhoon-news">here</a> for more weather related news."</p>',
            '<p>Click <a href="https://www.inquirer.net/philippine-typhoon-news">here</a> for more weather related news.</p>',
            '<strong> </strong>',
        );

        $replace = array('', '', ' ');

        $response = str_replace($search, $replace, $response);

        $this->crawler = new DomCrawler($response);
    }
}
||
179 |