1 | <?php |
||||
2 | |||||
3 | namespace Pilipinews\Website\Rappler; |
||||
4 | |||||
5 | use Pilipinews\Common\Article; |
||||
6 | use Pilipinews\Common\Interfaces\ScraperInterface; |
||||
7 | use Pilipinews\Common\Scraper as AbstractScraper; |
||||
8 | use Pilipinews\Common\Crawler as DomCrawler; |
||||
9 | |||||
/**
 * Rappler News Scraper
 *
 * Scrapes a Rappler article page and rewrites embedded media (captioned
 * images, Scribd embeds, iframes) as plain text paragraphs.
 *
 * @package Pilipinews
 * @author  Rougin Gutib <[email protected]>
 */
class Scraper extends AbstractScraper implements ScraperInterface
{
    /**
     * Selectors handed to remove() before the article body is read.
     *
     * @var string[]
     */
    protected $removables = array('.author-box');

    /**
     * Boilerplate passages handed to html() together with the article body.
     *
     * NOTE(review): presumably html() strips these passages from the final
     * text; its implementation is not part of this file, so confirm there.
     *
     * @var string[]
     */
    protected $texts = array(
        "What's the weather like in your area? Report the situation through Rappler's Agos (http://agos.rappler.com/) or tweet us at @rapplerdotcom (https://twitter.com/rapplerdotcom).",
        "Not on the list? Help us crowdsource class suspensions by posting in the comments section or tweeting @rapplerdotcom (https://twitter.com/rapplerdotcom).\n\nFor more information: (https://www.facebook.com/gov.abet/posts/10152811185356858)When are classes cancelled or suspended? (https://www.rappler.com/move-ph/31299-classes-cancelled-suspended)",
        "\n\nPlease refresh this page for updates."
    );

    /**
     * Returns the contents of an article.
     *
     * The link is lower-cased only for the prepare() call; the returned
     * Article keeps the link exactly as it was given.
     *
     * @param  string $link
     * @return \Pilipinews\Common\Article
     */
    public function scrape($link)
    {
        $this->prepare(mb_strtolower($link));

        $title = $this->title('h1');

        $this->remove((array) $this->removables);

        $body = $this->body('.ArticleWrapper__ArticleBodyWrapper-sc-36pn73-0');

        // Order is kept from the original: Scribd embeds are rewritten before
        // the generic "iframe" pass in video().
        // NOTE(review): presumably because Scribd embeds are iframes too and
        // would otherwise be reported as videos - confirm against real markup.
        $body = $this->image($body);

        $body = $this->scribd($body);

        $body = $this->video($body);

        // tweet() is not defined in this class.
        // NOTE(review): presumably inherited from the parent scraper - confirm.
        $body = $this->tweet($body);

        $html = $this->html($body, $this->texts);

        $html = htmlspecialchars_decode($html);

        return new Article($title, $html, $link);
    }

    /**
     * Converts image elements to readable string.
     *
     * Every "p.caption" element is replaced by a "PHOTO: <url> - <caption>"
     * paragraph. The URL is read from the "data-original" attribute of the
     * <img> inside the caption's nearest preceding sibling, and that sibling
     * is removed from the document.
     *
     * NOTE(review): a caption without a preceding sibling (or one without an
     * <img>) is not handled here; with a Symfony-style crawler attr() would
     * throw on the empty node list. Confirm whether such markup can occur.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function image(DomCrawler $crawler)
    {
        // $html is unused here; the parameter is kept so the closure keeps the
        // same signature as the other callbacks passed to replace().
        $callback = function (DomCrawler $crawler, $html)
        {
            $image = $crawler->previousAll()->first();

            $photo = $image->filter('img')->attr('data-original');

            // Detach the image container so it is not rendered a second time.
            $node = $image->getNode(0);

            $node->parentNode->removeChild($node);

            $text = trim($crawler->first()->text());

            // Compare against '' explicitly: a plain truthiness check would
            // treat a caption of "0" as empty and glue it onto the URL.
            if ($text !== '')
            {
                $text = ' - ' . $text;
            }

            return '<p>PHOTO: ' . $photo . $text . '</p>';
        };

        return $this->replace($crawler, 'p.caption', $callback);
    }

    /**
     * Converts embedded Scribd elements to readable string.
     *
     * Every ".scribd_iframe_embed" element is replaced by a paragraph in the
     * form "<title> (<src>)", built from the element's own attributes.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function scribd(DomCrawler $crawler)
    {
        // $html is unused here; see image() for why the parameter is kept.
        $callback = function (DomCrawler $crawler, $html)
        {
            // The casts turn a missing attribute into an empty string.
            $title = (string) $crawler->attr('title');

            $link = (string) $crawler->attr('src');

            return '<p>' . $title . ' (' . $link . ')</p>';
        };

        return $this->replace($crawler, '.scribd_iframe_embed', $callback);
    }

    /**
     * Converts embedded iframe elements to readable string.
     *
     * Every "iframe" element is replaced by a "VIDEO: <src>" paragraph.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function video(DomCrawler $crawler)
    {
        // $html is unused here; see image() for why the parameter is kept.
        $callback = function (DomCrawler $crawler, $html)
        {
            return '<p>VIDEO: ' . $crawler->attr('src') . '</p>';
        };

        return $this->replace($crawler, 'iframe', $callback);
    }
}
||||
130 |
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.