pilipinews /
inquirer
| 1 | <?php |
||
| 2 | |||
| 3 | namespace Pilipinews\Website\Inquirer; |
||
| 4 | |||
| 5 | use Pilipinews\Common\Article; |
||
| 6 | use Pilipinews\Common\Client; |
||
| 7 | use Pilipinews\Common\Crawler as DomCrawler; |
||
| 8 | use Pilipinews\Common\Interfaces\ScraperInterface; |
||
| 9 | use Pilipinews\Common\Scraper as AbstractScraper; |
||
| 10 | |||
| 11 | /** |
||
| 12 | * Inquirer News Scraper |
||
| 13 | * |
||
| 14 | * @package Pilipinews |
||
| 15 | * @author Rougin Gutib <[email protected]> |
||
| 16 | */ |
||
class Scraper extends AbstractScraper implements ScraperInterface
{
    /**
     * Subscription footer text that is stripped from the scraped article body.
     */
    const TEXT_FOOTER = 'Subscribe to INQUIRER PLUS (https://www.inquirer.net/plus) to get access to The Philippine Daily Inquirer & other 70+ titles, share up to 5 gadgets, listen to the news, download as early as 4am & share articles on social media. Call 896 6000.';

    /**
     * Second subscription footer variant that is stripped from the body.
     *
     * NOTE(review): as seen here this looks textually identical to TEXT_FOOTER;
     * the two may differ only in characters that are not visually distinguishable
     * (e.g. whitespace variants). Confirm before merging the two constants.
     */
    const TEXT_FOOTER2 = 'Subscribe to INQUIRER PLUS (https://www.inquirer.net/plus) to get access to The Philippine Daily Inquirer & other 70+ titles, share up to 5 gadgets, listen to the news, download as early as 4am & share articles on social media. Call 896 6000.';

    /**
     * Coronavirus "more news" footer (including its two leading newlines).
     */
    const TEXT_FOOTER3 = "\n\n" . 'For more news about the novel coronavirus click here. (https://www.inquirer.net/novel-coronavirus-update)';

    /**
     * Coronavirus "what you need to know" footer (including its two leading newlines).
     */
    const TEXT_FOOTER4 = "\n\n" . 'What you need to know about Coronavirus. (https://newsinfo.inquirer.net/1243479/coronavirus-pandemic-2020-everything-you-need-to-know)';

    /**
     * Phrases handed to the parent "html" helper together with the body.
     *
     * @var string[]
     */
    protected $refresh = array('Refresh this page for updates.');

    /**
     * CSS selectors of elements removed from the page before the body is read.
     *
     * @var string[]
     */
    protected $removables = array(
        '#ms-slider-wrap',
        '#mr-2018-wrap',
        'script',
        '#billboard_article',
        '.ventuno-vid',
        '#article_disclaimer',
        '.OUTBRAIN',
        '#ch-follow-us',
        '.view-comments',
        '#article_tags',
        '.adsbygoogle',
        '#article-new-featured',
        '#read-next-2018',
        '#rn-lbl',
        '#fb-root',
        '#lsmr-lbl',
        '#lsmr-box',
        '.bb_iawr',
        'style',
        '.ob_amplifypixel',
        '.lazyload',
    );

    /**
     * Returns the contents of an article.
     *
     * The link is lower-cased before it is requested, while the returned
     * article keeps the link exactly as it was given.
     *
     * @param  string $link
     * @return \Pilipinews\Common\Article
     */
    public function scrape($link)
    {
        $this->prepare(mb_strtolower($link));

        $title = $this->title('.entry-title');

        // Matches the "-<width>x<height>.jpg" suffix of resized image files.
        // The dot is escaped so that only a literal ".jpg" extension matches.
        $pattern = '/-(\d+)x(\d+)\.jpg/i';

        $this->remove((array) $this->removables);

        $body = $this->body('#article_content');

        $body = $this->caption($body);

        $body = $this->fbvideo($body);

        $body = $this->fbpost($body)->html();

        // Drop the size suffix so image links end in a plain ".jpg".
        $body = preg_replace($pattern, '.jpg', $body);

        $body = $this->html(new DomCrawler($body), $this->refresh);

        // Footers are removed one at a time, in this exact order, and the
        // body is trimmed before each replacement.
        $footers = array(
            self::TEXT_FOOTER3,
            self::TEXT_FOOTER,
            self::TEXT_FOOTER4,
            self::TEXT_FOOTER2,
        );

        foreach ($footers as $footer)
        {
            $body = str_replace($footer, '', trim($body));
        }

        return new Article($title, trim($body), (string) $link);
    }

    /**
     * Converts caption elements to readable string.
     *
     * Each ".wp-caption" element is replaced by a paragraph in the form
     * "PHOTO: <image source> - <caption HTML>".
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function caption(DomCrawler $crawler)
    {
        // NOTE(review): assumes every ".wp-caption" holds an "img" and a
        // ".wp-caption-text" element; confirm how Crawler behaves otherwise.
        $callback = function (DomCrawler $crawler)
        {
            $image = $crawler->filter('img')->first()->attr('src');

            $format = (string) '<p>PHOTO: %s - %s</p>';

            $text = $crawler->filter('.wp-caption-text')->first();

            return sprintf($format, $image, $text->html());
        };

        return $this->replace($crawler, '.wp-caption', $callback);
    }

    /**
     * Converts Facebook embedded posts to readable string.
     *
     * Each ".fb-xfbml-parse-ignore" element is replaced by a "POST: <cite>"
     * paragraph followed by a paragraph with the text of its first "p > a".
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function fbpost(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            // Read the "cite" attribute once and reuse it.
            $link = $crawler->attr('cite');

            $text = '<p>POST: ' . $link . '</p>';

            $message = $crawler->filter('p > a')->first();

            return $text . '<p>' . $message->text() . '</p>';
        };

        return $this->replace($crawler, '.fb-xfbml-parse-ignore', $callback);
    }

    /**
     * Converts fbvideo elements to readable string.
     *
     * Each ".fb-video" element is replaced by a "VIDEO: <data-href>" paragraph.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function fbvideo(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            $link = $crawler->attr('data-href');

            return '<p>VIDEO: ' . $link . '</p>';
        };

        return $this->replace($crawler, '.fb-video', $callback);
    }

    /**
     * Initializes the crawler instance.
     *
     * Requests the page, strips the "weather related news" paragraphs (both
     * spellings seen on the site), replaces empty "strong" spacers with a
     * single space, then stores a crawler built from the cleaned response.
     *
     * @param  string $link
     * @return void
     */
    protected function prepare($link)
    {
        $response = Client::request((string) $link);

        // Variant with a stray double quote after the sentence.
        $response = str_replace('<p>Click <a href="https://www.inquirer.net/philippine-typhoon-news">here</a> for more weather related news."</p>', '', $response);

        $response = str_replace('<p>Click <a href="https://www.inquirer.net/philippine-typhoon-news">here</a> for more weather related news.</p>', '', $response);

        $response = str_replace('<strong> </strong>', ' ', $response);

        $this->crawler = new DomCrawler($response);
    }
}
||
| 179 |