1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Pilipinews\Website\Abscbn; |
4
|
|
|
|
5
|
|
|
use Pilipinews\Common\Article; |
6
|
|
|
use Pilipinews\Common\Crawler as DomCrawler; |
7
|
|
|
use Pilipinews\Common\Interfaces\ScraperInterface; |
8
|
|
|
use Pilipinews\Common\Scraper as AbstractScraper; |
9
|
|
|
|
10
|
|
|
/**
 * ABS-CBN News Scraper
 *
 * Turns an ABS-CBN article page into an Article by extracting the title and
 * body, then rewriting embedded media (albums, embedly cards, images, tweets,
 * videos and Facebook posts) into plain "TYPE: link" text.
 *
 * @package Pilipinews
 * @author Rougin Gutib <[email protected]>
 */
class Scraper extends AbstractScraper implements ScraperInterface
{
    /**
     * CSS selectors handed to remove() before the article body is read.
     *
     * @var string[]
     */
    protected $removables = array('.patrolbox', '.op-related-articles', 'script', '.iwantbar');

    /**
     * Phrases handed to html() together with the article body.
     * NOTE(review): presumably boilerplate that html() strips - confirm in the parent class.
     *
     * @var string[]
     */
    protected $texts = array(
        'I-refresh ang pahinang ito para sa updates.',
        'I-refresh ang page na ito para sa updates.',
        'Refresh this link for more details.',
        'I-refresh ang web page na ito para sa mga pinakahuling update.',
        'Please refresh this page for updates.',
    );

    /**
     * Returns the contents of an article.
     *
     * The link is lower-cased only for fetching; the returned Article keeps
     * the link exactly as it was given.
     *
     * @param  string $link
     * @return \Pilipinews\Common\Article
     */
    public function scrape($link)
    {
        $this->prepare(mb_strtolower($link));

        $title = $this->title('h1.news-title');

        $this->remove((array) $this->removables);

        $body = $this->body('.article-content');

        // Each step rewrites one kind of embedded element into plain text.
        // NOTE(review): tweet() is not defined in this class; presumably it
        // is inherited from the parent scraper.
        $body = $this->album($body);

        $body = $this->embedly($body);

        $body = $this->image($body);

        $body = $this->tweet($body);

        $body = $this->video($body);

        $body = $this->post($body);

        $html = $this->html($body, $this->texts);

        $html = htmlspecialchars_decode($html);

        return new Article($title, $html, $link);
    }

    /**
     * Converts an album element into a readable string.
     *
     * Every slider image becomes a "PHOTO: <src> - <caption>" paragraph and
     * the paragraphs are joined with blank lines.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function album(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            $results = array();

            $items = $crawler->filter('.slider-for > div > img');

            $texts = $crawler->filter('.slider-desc > .item-desc > p');

            // Counts do not change while iterating, so read them only once.
            $total = $items->count();

            $captions = $texts->count();

            for ($i = 0; $i < $total; $i++)
            {
                $link = 'PHOTO: ' . $items->eq($i)->attr('src');

                $text = '';

                // A lone caption is not attached to any photo. The index check
                // prevents reading a caption that does not exist when there are
                // fewer captions than photos (or none at all).
                if ($captions !== 1 && $i < $captions)
                {
                    $text = $texts->eq($i)->text();
                }

                $result = '<p>' . $link . ' - ' . $text . '</p>';

                // Drops the dangling separator when the caption is empty.
                $results[] = str_replace(' - </', '</', $result);
            }

            return implode("\n\n", $results);
        };

        return $this->replace($crawler, '.media-content', $callback);
    }

    /**
     * Converts embedly elements into a readable string.
     *
     * Each card becomes "EMBED: <href>" using the first anchor inside it.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function embedly(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            $item = $crawler->filter('a')->first();

            return 'EMBED: ' . $item->attr('href');
        };

        return $this->replace($crawler, '.embedly-card', $callback);
    }

    /**
     * Converts image elements into a readable string.
     *
     * Each wrapper becomes a "PHOTO: <src> - <text>" paragraph. When the
     * wrapper's HTML contains an <em> tag, the text of the first <em> is
     * wrapped in parentheses.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function image(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler, $html)
        {
            $image = 'PHOTO: ' . $crawler->filter('img')->attr('src');

            $image = str_replace('?ext=.jpg', '', (string) $image);

            $text = '<p>' . $image . ' - ' . $crawler->text() . '</p>';

            if (strpos($html, '<em>') !== false)
            {
                $em = $crawler->filter('em')->first()->text();

                $text = str_replace($em, '(' . $em . ')', $text);
            }

            // Drops the dangling separator when the wrapper has no text.
            return str_replace(' - </', '</', (string) $text);
        };

        return $this->replace($crawler, '.embed-wrap', $callback);
    }

    /**
     * Converts post elements into a readable string.
     *
     * Each Facebook post element becomes a "POST: <data-href>" paragraph.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function post(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $node)
        {
            return '<p>POST: ' . $node->attr('data-href') . '</p>';
        };

        return $this->replace($crawler, '.fb-post', $callback);
    }

    /**
     * Converts video elements into a readable string.
     *
     * Each interactive element becomes a "VIDEO: <src>" paragraph using the
     * source of the iframe inside it.
     *
     * @param  \Pilipinews\Common\Crawler $crawler
     * @return \Pilipinews\Common\Crawler
     */
    protected function video(DomCrawler $crawler)
    {
        $callback = function (DomCrawler $crawler)
        {
            $element = $crawler->filter('iframe');

            $link = $element->attr('src');

            return '<p>VIDEO: ' . $link . '</p>';
        };

        return $this->replace($crawler, '.op-interactive', $callback);
    }
}
194
|
|
|
|
This check looks for parameters that have been defined for a function or method, but which are not used in the method body.