pilipinews /
rappler
| 1 | <?php |
||||
| 2 | |||||
| 3 | namespace Pilipinews\Website\Rappler; |
||||
| 4 | |||||
| 5 | use Pilipinews\Common\Article; |
||||
| 6 | use Pilipinews\Common\Interfaces\ScraperInterface; |
||||
| 7 | use Pilipinews\Common\Scraper as AbstractScraper; |
||||
| 8 | use Pilipinews\Common\Crawler as DomCrawler; |
||||
| 9 | |||||
| 10 | /** |
||||
| 11 | * Rappler News Scraper |
||||
| 12 | * |
||||
| 13 | * @package Pilipinews |
||||
| 14 | * @author Rougin Gutib <[email protected]> |
||||
| 15 | */ |
||||
class Scraper extends AbstractScraper implements ScraperInterface
{
    /**
     * CSS selectors handed to remove() in scrape() before the article
     * body is read.
     *
     * @var string[]
     */
    protected $removables = array('.author-box');

    /**
     * Boilerplate passages handed to html() together with the article body.
     *
     * NOTE(review): presumably html() strips these passages from the final
     * output; that method is not defined in this class - confirm in the
     * parent scraper.
     *
     * @var string[]
     */
    protected $texts = array(
        "What's the weather like in your area? Report the situation through Rappler's Agos (http://agos.rappler.com/) or tweet us at @rapplerdotcom (https://twitter.com/rapplerdotcom).",
        "Not on the list? Help us crowdsource class suspensions by posting in the comments section or tweeting @rapplerdotcom (https://twitter.com/rapplerdotcom).\n\nFor more information: (https://www.facebook.com/gov.abet/posts/10152811185356858)When are classes cancelled or suspended? (https://www.rappler.com/move-ph/31299-classes-cancelled-suspended)",
        "\n\nPlease refresh this page for updates."
    );

    /**
     * Returns the contents of an article.
     *
     * The link is lower-cased only for preparing the page; the returned
     * Article receives the link exactly as it was given.
     *
     * @param  string $link
     * @return \Pilipinews\Common\Article
     */
    public function scrape($link)
    {
        $this->prepare(mb_strtolower($link));

        $title = $this->title('h1');

        $this->remove((array) $this->removables);

        $body = $this->body('.ArticleWrapper__ArticleBodyWrapper-sc-36pn73-0');

        // Rewrite embedded media into plain "<p>...</p>" text, one kind at a
        // time. tweet() is not defined in this class (presumably inherited).
        $body = $this->image($body);

        $body = $this->scribd($body);

        $body = $this->video($body);

        $body = $this->tweet($body);

        $html = $this->html($body, $this->texts);

        $html = htmlspecialchars_decode($html);

        return new Article($title, $html, $link);
    }

    /**
     * Converts image elements to readable string.
     *
     * Every "p.caption" element is replaced by "<p>PHOTO: {url}</p>", or by
     * "<p>PHOTO: {url} - {caption}</p>" when the caption has text. The URL is
     * the "data-original" attribute of the image found in the sibling
     * returned by previousAll()->first(); that sibling is removed from the
     * document.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function image(DomCrawler $crawler)
    {
        // $html is not needed here; it stays in the signature so the closure
        // keeps the same two-parameter shape as the other converters below.
        $callback = function (DomCrawler $crawler, $html)
        {
            $image = $crawler->previousAll()->first();

            $photo = $image->filter('img')->attr('data-original');

            // Drop the image holder so the picture is not rendered twice.
            $node = $image->getNode(0);

            $node->parentNode->removeChild($node);

            $text = trim($crawler->first()->text());

            // Compare against '' explicitly: a plain truthiness test would
            // treat a caption of "0" as empty and glue it to the URL.
            if ($text !== '')
            {
                $text = ' - ' . $text;
            }

            return '<p>PHOTO: ' . $photo . $text . '</p>';
        };

        return $this->replace($crawler, 'p.caption', $callback);
    }

    /**
     * Converts embedded Scribd elements to readable string.
     *
     * Every ".scribd_iframe_embed" element is replaced by
     * "<p>{title} ({src})</p>" built from its "title" and "src" attributes.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function scribd(DomCrawler $crawler)
    {
        // $html is unused (flagged by static analysis); it is kept so the
        // closure matches the two-parameter shape used across this class.
        $callback = function (DomCrawler $crawler, $html)
        {
            $title = (string) $crawler->attr('title');

            $link = (string) $crawler->attr('src');

            return '<p>' . $title . ' (' . $link . ')</p>';
        };

        return $this->replace($crawler, '.scribd_iframe_embed', $callback);
    }

    /**
     * Converts embedded iframe elements to readable string.
     *
     * Every "iframe" element is replaced by "<p>VIDEO: {src}</p>".
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function video(DomCrawler $crawler)
    {
        // $html is unused (flagged by static analysis); it is kept so the
        // closure matches the two-parameter shape used across this class.
        $callback = function (DomCrawler $crawler, $html)
        {
            return '<p>VIDEO: ' . $crawler->attr('src') . '</p>';
        };

        return $this->replace($crawler, 'iframe', $callback);
    }
}
||||
| 130 |
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.