1 | <?php |
||
2 | |||
3 | namespace Pilipinews\Website\Gma; |
||
4 | |||
5 | use Pilipinews\Common\Article; |
||
6 | use Pilipinews\Common\Client; |
||
7 | use Pilipinews\Common\Converter; |
||
8 | use Pilipinews\Common\Crawler as DomCrawler; |
||
9 | use Pilipinews\Common\Interfaces\ScraperInterface; |
||
10 | use Pilipinews\Common\Scraper as AbstractScraper; |
||
11 | |||
/**
 * GMA News Scraper
 *
 * Builds an Article from the JSON-LD metadata embedded in a GMA News page.
 *
 * @package Pilipinews
 * @author  Rougin Gutib <[email protected]>
 */
class Scraper extends AbstractScraper implements ScraperInterface
{
    /**
     * Decoded JSON-LD metadata of the page fetched by prepare().
     * Stays an empty array when the page carries no usable metadata.
     *
     * @var array
     */
    protected $json = array();

    /**
     * Returns the contents of an article.
     *
     * The link is lower-cased only for the HTTP request; the returned
     * Article keeps the link exactly as it was given.
     *
     * @param  string $link
     * @return \Pilipinews\Common\Article
     */
    public function scrape($link)
    {
        $this->prepare(mb_strtolower($link));

        // Fall back to an empty title when the metadata has no headline,
        // instead of indexing into a missing key.
        $title = isset($this->json['headline']) ? $this->json['headline'] : '';

        $title = str_replace(' | News |', '', $title);

        $converter = new Converter;

        $title = $converter->convert($title);

        // NOTE(review): tweet() and html() are not defined in this class;
        // presumably they are inherited from the base scraper - confirm.
        $body = $this->tweet($this->crawler);

        $html = (string) $this->html($body);

        return new Article($title, $html, $link);
    }

    /**
     * Initializes the crawler instance.
     *
     * Fetches the page, extracts the first JSON-LD script block, stores its
     * decoded contents in $this->json and builds the crawler from the
     * "articleBody" entry. When the block is missing or is not valid JSON,
     * $this->json is left empty and the crawler is built from an empty string.
     *
     * @param  string $link
     * @return void
     */
    protected function prepare($link)
    {
        $response = (string) Client::request((string) $link);

        // Collapsing all whitespace (including newlines) into single spaces
        // lets the "." in the pattern below span the whole script block.
        $html = trim(preg_replace('/\s+/', ' ', $response));

        $html = str_replace('<p> <strong>', '<p><strong>', $html);

        $html = str_replace('<br /> ', '<br />', $html);

        $this->json = array();

        $found = preg_match('/<script type="application\/ld\+json"\>(.*?)<\/script\>/i', $html, $match);

        // preg_match() returns 1 only on a successful match; 0 and false
        // both mean there is nothing to decode.
        if ($found === 1) {
            $decoded = json_decode($match[1], true);

            // json_decode() yields null on malformed JSON; keep the empty
            // array in that case so later key lookups stay safe.
            if (is_array($decoded)) {
                $this->json = $decoded;
            }
        }

        $content = isset($this->json['articleBody']) ? (string) $this->json['articleBody'] : '';

        $this->crawler = new DomCrawler($content);
    }
}
||
70 |