pilipinews /
rappler
| 1 | <?php |
||
| 2 | |||
| 3 | namespace Pilipinews\Website\Rappler; |
||
| 4 | |||
| 5 | use Pilipinews\Common\Client; |
||
| 6 | use Pilipinews\Common\Crawler as DomCrawler; |
||
| 7 | use Pilipinews\Common\Interfaces\CrawlerInterface; |
||
| 8 | |||
/**
 * Rappler News Crawler
 *
 * Collects article links from the Rappler "nation" section page and
 * returns them as absolute URLs.
 *
 * @package Pilipinews
 * @author  Rougin Gutib
 */
class Crawler implements CrawlerInterface
{
    /**
     * Headline prefixes (the text before the first colon) that cause an
     * article to be skipped.
     *
     * @var string[]
     */
    protected $excluded = array('IN PHOTOS', 'LIVE', 'WATCH', 'LOOK', 'Rappler Talk', 'PANOORIN');

    /**
     * Address of the page that is requested and searched for article links.
     *
     * @var string
     */
    protected $link = 'https://rappler.com/section/nation';

    /**
     * Selector passed to the crawler's filter() to locate the link nodes.
     *
     * @var string
     */
    protected $pattern = '.A__DefaultLink-sc-120nwt8-0.eqXhhw';

    /**
     * Returns an array of articles to scrape.
     *
     * Requests the section page, keeps only links whose first path segment
     * is "nation" and whose text does not start with an excluded prefix,
     * and returns the distinct links prefixed with the site's base URL.
     *
     * @return string[]
     */
    public function crawl()
    {
        $base = 'https://rappler.com';

        $keywords = (array) $this->excluded;

        // Tells whether a headline starts with one of the excluded prefixes.
        $excluded = function ($text) use ($keywords)
        {
            // Split on the FIRST colon only, so a headline with more than
            // one colon ("WATCH: Senator: statement") still yields "WATCH".
            $parts = explode(':', (string) $text, 2);

            // No colon means there is no prefix to check.
            if (count($parts) < 2)
            {
                return false;
            }

            // Surrounding whitespace from the markup must not defeat the match.
            return in_array(trim($parts[0]), $keywords, true);
        };

        // Maps a link node to its absolute URL, or null when it is rejected.
        $callback = function (DomCrawler $node) use ($base, $excluded)
        {
            // The attribute may be missing; treat that as an empty link.
            $link = (string) $node->attr('href');

            $items = explode('/', $link);

            // A root-relative link such as "/nation/slug" has "nation" at
            // index 1; links without a slash have no such index at all.
            $section = isset($items[1]) ? $items[1] : null;

            $allowed = $section === 'nation' && ! $excluded($node->text());

            return $allowed ? $base . $link : null;
        };

        $crawler = new DomCrawler(Client::request($this->link));

        $news = $crawler->filter((string) $this->pattern);

        // array_filter drops the null entries produced for rejected links.
        $filtered = array_filter($news->each($callback));

        // Remove duplicates, then reindex so the result is a plain list.
        return array_values(array_unique($filtered));
    }
}
||
| 72 |