1 | <?php |
||
2 | |||
3 | namespace Pilipinews\Website\Rappler; |
||
4 | |||
5 | use Pilipinews\Common\Client; |
||
6 | use Pilipinews\Common\Crawler as DomCrawler; |
||
7 | use Pilipinews\Common\Interfaces\CrawlerInterface; |
||
8 | |||
9 | /** |
||
10 | * Rappler News Crawler |
||
11 | * |
||
12 | * @package Pilipinews |
||
13 | * @author Rougin Gutib <[email protected]> |
||
14 | */ |
||
class Crawler implements CrawlerInterface
{
    /**
     * Headline prefixes (the text before the first colon of a link's text)
     * that mark an article as one to skip.
     *
     * @var string[]
     */
    protected $excluded = array('IN PHOTOS', 'LIVE', 'WATCH', 'LOOK', 'Rappler Talk', 'PANOORIN');

    /**
     * Address of the listing page that is requested for article links.
     *
     * @var string
     */
    protected $link = 'https://rappler.com/section/nation';

    /**
     * Selector passed to the crawler's filter() to pick the article anchors.
     *
     * @var string
     */
    protected $pattern = '.A__DefaultLink-sc-120nwt8-0.eqXhhw';

    /**
     * Returns an array of articles to scrape.
     *
     * Requests the listing page, keeps the anchors whose "href" has "nation"
     * as its first path segment and whose text does not start with one of
     * the excluded prefixes, and returns their absolute, de-duplicated URLs
     * in the order the matching nodes were visited.
     *
     * @return string[]
     */
    public function crawl()
    {
        $base = 'https://rappler.com';

        $prefixes = (array) $this->excluded;

        $excluded = function ($text) use ($prefixes)
        {
            // Only the text before the FIRST colon is the prefix; a greedy
            // match would turn "WATCH: Name: quote" into "WATCH: Name" and
            // let the article slip through the exclusion list.
            if (! preg_match('/^([^:]*):/', trim((string) $text), $matches)) {
                return false;
            }

            return in_array(trim($matches[1]), $prefixes, true);
        };

        $callback = function (DomCrawler $node) use ($base, $excluded)
        {
            // Cast so that an anchor without a "href" is handled as an empty
            // link instead of passing null to explode().
            $link = (string) $node->attr('href');

            $items = explode('/', $link);

            // For a relative link such as "/nation/slug" the segment at index
            // 1 is the section; links without that segment are never allowed.
            if (! isset($items[1]) || $items[1] !== 'nation') {
                return null;
            }

            return $excluded($node->text()) ? null : $base . $link;
        };

        $crawler = new DomCrawler(Client::request($this->link));

        $news = $crawler->filter((string) $this->pattern);

        // array_filter() drops the nulls returned for rejected anchors.
        $filtered = array_filter($news->each($callback));

        // array_values() re-indexes the list after the keys were left sparse
        // by array_filter() and array_unique().
        return array_values(array_unique($filtered));
    }
}
||
72 |