pilipinews /
gma
| 1 | <?php |
||
| 2 | |||
| 3 | namespace Pilipinews\Website\Gma; |
||
| 4 | |||
| 5 | use Pilipinews\Common\Article; |
||
| 6 | use Pilipinews\Common\Client; |
||
| 7 | use Pilipinews\Common\Converter; |
||
| 8 | use Pilipinews\Common\Crawler as DomCrawler; |
||
| 9 | use Pilipinews\Common\Interfaces\ScraperInterface; |
||
| 10 | use Pilipinews\Common\Scraper as AbstractScraper; |
||
| 11 | |||
| 12 | /** |
||
| 13 | * GMA News Scraper |
||
| 14 | * |
||
| 15 | * @package Pilipinews |
||
| 16 | * @author Rougin Gutib <[email protected]> |
||
| 17 | */ |
||
class Scraper extends AbstractScraper implements ScraperInterface
{
    /**
     * Decoded contents of the page's "application/ld+json" block.
     *
     * Stays an empty array when the block is missing or is not valid JSON.
     *
     * @var array
     */
    protected $json = array();

    /**
     * Returns the contents of an article.
     *
     * The page is requested using the lower-cased link, while the returned
     * article keeps the link exactly as it was given.
     *
     * @param  string $link
     * @return \Pilipinews\Common\Article
     */
    public function scrape($link)
    {
        $this->prepare(mb_strtolower($link));

        // A page without usable metadata yields an empty title rather than
        // reading an offset that does not exist.
        $title = isset($this->json['headline']) ? $this->json['headline'] : '';

        $title = str_replace(' | News |', '', $title);

        $converter = new Converter;

        $title = $converter->convert($title);

        // "tweet" and "html" are not defined in this class; presumably they
        // are inherited from the parent scraper (confirm there).
        $body = $this->tweet($this->crawler);

        $html = (string) $this->html($body);

        return new Article($title, $html, $link);
    }

    /**
     * Initializes the crawler instance.
     *
     * Downloads the page, normalizes its whitespace, decodes the first
     * "application/ld+json" script block into $this->json and builds the
     * crawler from its "articleBody" entry. If the block is missing or cannot
     * be decoded, $this->json becomes an empty array and the crawler is built
     * from an empty string.
     *
     * @param  string $link
     * @return void
     */
    protected function prepare($link)
    {
        $response = (string) Client::request((string) $link);

        // Collapse every whitespace run into a single space. preg_replace
        // returns null on a PCRE failure, hence the cast before trimming.
        $html = trim((string) preg_replace('/\s+/', ' ', $response));

        // Remove the stray spaces that the collapsing above leaves behind.
        $html = str_replace('<p> <strong>', '<p><strong>', $html);

        $html = str_replace('<br /> ', '<br />', $html);

        $pattern = '/<script type="application\/ld\+json"\>(.*?)<\/script\>/i';

        $data = null;

        // Only decode when the JSON-LD block was actually found; otherwise
        // $match[1] would not exist.
        if (preg_match($pattern, $html, $match) === 1) {
            $data = json_decode($match[1], true);
        }

        // json_decode returns null for malformed input; normalize to an array
        // so later offset reads are safe.
        $this->json = is_array($data) ? $data : array();

        $content = '';

        if (isset($this->json['articleBody'])) {
            $content = (string) $this->json['articleBody'];
        }

        $this->crawler = new DomCrawler($content);
    }
}
||
| 70 |