Issues (4)

src/Crawler.php (1 issue)

1
<?php
2
3
namespace Pilipinews\Website\Rappler;
4
5
use Pilipinews\Common\Client;
6
use Pilipinews\Common\Crawler as DomCrawler;
7
use Pilipinews\Common\Interfaces\CrawlerInterface;
8
9
/**
10
 * Rappler News Crawler
11
 *
12
 * @package Pilipinews
13
 * @author  Rougin Gutib <[email protected]>
14
 */
15
class Crawler implements CrawlerInterface
{
    /**
     * Title prefixes (the text before the first colon) of articles to skip.
     *
     * @var string[]
     */
    protected $excluded = array('IN PHOTOS', 'LIVE', 'WATCH', 'LOOK', 'Rappler Talk', 'PANOORIN');

    /**
     * Address of the page that lists the articles.
     *
     * @var string
     */
    protected $link = 'https://rappler.com/section/nation';

    /**
     * Selector handed to the crawler's filter() to pick the article anchors.
     *
     * @var string
     */
    protected $pattern = '.A__DefaultLink-sc-120nwt8-0.eqXhhw';

    /**
     * Returns an array of articles to scrape.
     *
     * Requests the listing page, keeps the anchors whose "href" starts with
     * "/nation/" and whose title does not begin with an excluded prefix, and
     * returns their absolute, de-duplicated addresses.
     *
     * @return string[]
     */
    public function crawl()
    {
        $base = 'https://rappler.com';

        // Copied to a local so the closure below does not need $this.
        $keywords = (array) $this->excluded;

        // Tells whether a title starts with one of the excluded prefixes.
        $excluded = function ($text) use ($keywords)
        {
            // Lazy match up to the FIRST colon: a greedy match would stop at
            // the last one, so "WATCH: Foo: Bar" would never be excluded.
            // The "s" flag lets a leading line break be part of the prefix.
            if (! preg_match('/^(.*?):/s', (string) $text, $matches))
            {
                return false;
            }

            // Node text may carry surrounding whitespace; compare trimmed.
            return in_array(trim($matches[1]), $keywords, true);
        };

        // Maps an anchor node to its absolute address, or null if rejected.
        $callback = function (DomCrawler $node) use ($base, $excluded)
        {
            $link = (string) $node->attr('href');

            $items = explode('/', $link);

            // "/nation/..." splits into array('', 'nation', ...). The isset()
            // guards anchors whose href has no slash or is missing entirely.
            $allowed = isset($items[1]) && $items[1] === 'nation';

            if (! $allowed || $excluded($node->text()))
            {
                return null;
            }

            return $base . $link;
        };

        $crawler = new DomCrawler(Client::request($this->link));

        $news = $crawler->filter((string) $this->pattern);

        // array_filter() drops the null entries of the rejected anchors.
        $filtered = array_filter($news->each($callback));

        // Re-index after removing duplicates so callers receive a plain list.
        return array_values(array_unique($filtered));
    }
}
72