@@ -12,493 +12,493 @@ |
||
12 | 12 | */ |
13 | 13 | class Parser { |
14 | 14 | |
15 | - /** |
|
16 | - * @var ClientInterface |
|
17 | - */ |
|
18 | - private $client; |
|
19 | - |
|
20 | - /** |
|
21 | - * @var array |
|
22 | - */ |
|
23 | - private static $cache; |
|
24 | - |
|
25 | - /** |
|
26 | - * Constructor |
|
27 | - * @param ClientInterface $client HTTP Client |
|
28 | - */ |
|
29 | - public function __construct(ClientInterface $client) { |
|
30 | - $this->client = $client; |
|
31 | - } |
|
32 | - |
|
33 | - /** |
|
34 | - * Parses a URL into a an array of metatags |
|
35 | - * |
|
36 | - * @param string $url URL to parse |
|
37 | - * @return array |
|
38 | - */ |
|
39 | - public function parse($url = '') { |
|
40 | - |
|
41 | - $data = $this->getImageData($url); |
|
42 | - if (!$data) { |
|
43 | - $data = $this->getOEmbedData($url); |
|
44 | - } |
|
45 | - if (!$data) { |
|
46 | - $data = $this->getDOMData($url); |
|
47 | - if (is_array($data) && !empty($data['oembed_url'])) { |
|
48 | - foreach ($data['oembed_url'] as $oembed_url) { |
|
49 | - $oembed_data = $this->parse($oembed_url); |
|
50 | - if (!empty($oembed_data) && is_array($oembed_data)) { |
|
51 | - $oembed_data['oembed_url'] = $oembed_data['url']; |
|
52 | - unset($oembed_data['url']); |
|
53 | - $data = array_merge($data, $oembed_data); |
|
54 | - } |
|
55 | - } |
|
56 | - } |
|
57 | - } |
|
15 | + /** |
|
16 | + * @var ClientInterface |
|
17 | + */ |
|
18 | + private $client; |
|
19 | + |
|
20 | + /** |
|
21 | + * @var array |
|
22 | + */ |
|
23 | + private static $cache; |
|
24 | + |
|
25 | + /** |
|
26 | + * Constructor |
|
27 | + * @param ClientInterface $client HTTP Client |
|
28 | + */ |
|
29 | + public function __construct(ClientInterface $client) { |
|
30 | + $this->client = $client; |
|
31 | + } |
|
32 | + |
|
33 | + /** |
|
34 | + * Parses a URL into an array of metatags |
|
35 | + * |
|
36 | + * @param string $url URL to parse |
|
37 | + * @return array |
|
38 | + */ |
|
39 | + public function parse($url = '') { |
|
40 | + |
|
41 | + $data = $this->getImageData($url); |
|
42 | + if (!$data) { |
|
43 | + $data = $this->getOEmbedData($url); |
|
44 | + } |
|
45 | + if (!$data) { |
|
46 | + $data = $this->getDOMData($url); |
|
47 | + if (is_array($data) && !empty($data['oembed_url'])) { |
|
48 | + foreach ($data['oembed_url'] as $oembed_url) { |
|
49 | + $oembed_data = $this->parse($oembed_url); |
|
50 | + if (!empty($oembed_data) && is_array($oembed_data)) { |
|
51 | + $oembed_data['oembed_url'] = $oembed_data['url']; |
|
52 | + unset($oembed_data['url']); |
|
53 | + $data = array_merge($data, $oembed_data); |
|
54 | + } |
|
55 | + } |
|
56 | + } |
|
57 | + } |
|
58 | 58 | |
59 | - if (!is_array($data)) { |
|
60 | - $data = array(); |
|
61 | - } |
|
62 | - |
|
63 | - if (empty($data['thumbnail_url']) && !empty($data['thumbnails'])) { |
|
64 | - $data['thumbnail_url'] = $data['thumbnails'][0]; |
|
65 | - } |
|
66 | - |
|
67 | - return $data; |
|
68 | - } |
|
69 | - |
|
70 | - /** |
|
71 | - * Parses image metatags |
|
72 | - * |
|
73 | - * @param string $url URL of the image |
|
74 | - * @return array|false |
|
75 | - */ |
|
76 | - public function getImageData($url = '') { |
|
77 | - if (!$this->isImage($url)) { |
|
78 | - return false; |
|
79 | - } |
|
80 | - |
|
81 | - return array( |
|
82 | - 'type' => 'photo', |
|
83 | - 'url' => $url, |
|
84 | - 'thumbnails' => array($url), |
|
85 | - ); |
|
86 | - } |
|
87 | - |
|
88 | - /** |
|
89 | - * Parses OEmbed data |
|
90 | - * |
|
91 | - * @param string $url URL of the image |
|
92 | - * @return array|false |
|
93 | - */ |
|
94 | - public function getOEmbedData($url = '') { |
|
95 | - |
|
96 | - if (!$this->isJSON($url) && !$this->isXML($url)) { |
|
97 | - return false; |
|
98 | - } |
|
99 | - |
|
100 | - $meta = array( |
|
101 | - 'url' => $url, |
|
102 | - ); |
|
103 | - |
|
104 | - $content = $this->read($url); |
|
105 | - if (!$content) { |
|
106 | - return $meta; |
|
107 | - } |
|
108 | - |
|
109 | - $data = new \stdClass(); |
|
110 | - if ($this->isJSON($url)) { |
|
111 | - $data = json_decode($content); |
|
112 | - } else if ($this->isXML($url)) { |
|
113 | - $data = simplexml_load_string($content); |
|
114 | - } |
|
115 | - |
|
116 | - $props = array( |
|
117 | - 'type', |
|
118 | - 'version', |
|
119 | - 'title', |
|
120 | - 'author_name', |
|
121 | - 'author_url', |
|
122 | - 'provider_name', |
|
123 | - 'provider_url', |
|
124 | - 'cache_age', |
|
125 | - 'thumbnail_url', |
|
126 | - 'thumbnail_width', |
|
127 | - 'thumbnail_height', |
|
128 | - 'width', |
|
129 | - 'height', |
|
130 | - 'html', |
|
131 | - ); |
|
132 | - foreach ($props as $key) { |
|
133 | - if (!empty($data->$key)) { |
|
134 | - $meta[$key] = (string) $data->$key; |
|
135 | - } |
|
136 | - } |
|
137 | - return $meta; |
|
138 | - } |
|
139 | - |
|
140 | - /** |
|
141 | - * Parses metatags from DOM |
|
142 | - * |
|
143 | - * @param string $url URL |
|
144 | - * @return array|false |
|
145 | - */ |
|
146 | - public function getDOMData($url = '') { |
|
147 | - |
|
148 | - if (!$this->isHTML($url)) { |
|
149 | - return false; |
|
150 | - } |
|
151 | - |
|
152 | - $doc = $this->getDOM($url); |
|
153 | - $defaults = array( |
|
154 | - 'url' => $url, |
|
155 | - ); |
|
156 | - |
|
157 | - $link_tags = $this->parseLinkTags($doc); |
|
158 | - $meta_tags = $this->parseMetaTags($doc); |
|
159 | - $img_tags = $this->parseImgTags($doc); |
|
160 | - |
|
161 | - $meta = array_merge_recursive($defaults, $link_tags, $meta_tags, $img_tags); |
|
59 | + if (!is_array($data)) { |
|
60 | + $data = array(); |
|
61 | + } |
|
62 | + |
|
63 | + if (empty($data['thumbnail_url']) && !empty($data['thumbnails'])) { |
|
64 | + $data['thumbnail_url'] = $data['thumbnails'][0]; |
|
65 | + } |
|
66 | + |
|
67 | + return $data; |
|
68 | + } |
|
69 | + |
|
70 | + /** |
|
71 | + * Parses image metatags |
|
72 | + * |
|
73 | + * @param string $url URL of the image |
|
74 | + * @return array|false |
|
75 | + */ |
|
76 | + public function getImageData($url = '') { |
|
77 | + if (!$this->isImage($url)) { |
|
78 | + return false; |
|
79 | + } |
|
80 | + |
|
81 | + return array( |
|
82 | + 'type' => 'photo', |
|
83 | + 'url' => $url, |
|
84 | + 'thumbnails' => array($url), |
|
85 | + ); |
|
86 | + } |
|
87 | + |
|
88 | + /** |
|
89 | + * Parses OEmbed data |
|
90 | + * |
|
91 | + * @param string $url URL of the oEmbed resource (JSON or XML) |
|
92 | + * @return array|false |
|
93 | + */ |
|
94 | + public function getOEmbedData($url = '') { |
|
95 | + |
|
96 | + if (!$this->isJSON($url) && !$this->isXML($url)) { |
|
97 | + return false; |
|
98 | + } |
|
99 | + |
|
100 | + $meta = array( |
|
101 | + 'url' => $url, |
|
102 | + ); |
|
103 | + |
|
104 | + $content = $this->read($url); |
|
105 | + if (!$content) { |
|
106 | + return $meta; |
|
107 | + } |
|
108 | + |
|
109 | + $data = new \stdClass(); |
|
110 | + if ($this->isJSON($url)) { |
|
111 | + $data = json_decode($content); |
|
112 | + } else if ($this->isXML($url)) { |
|
113 | + $data = simplexml_load_string($content); |
|
114 | + } |
|
115 | + |
|
116 | + $props = array( |
|
117 | + 'type', |
|
118 | + 'version', |
|
119 | + 'title', |
|
120 | + 'author_name', |
|
121 | + 'author_url', |
|
122 | + 'provider_name', |
|
123 | + 'provider_url', |
|
124 | + 'cache_age', |
|
125 | + 'thumbnail_url', |
|
126 | + 'thumbnail_width', |
|
127 | + 'thumbnail_height', |
|
128 | + 'width', |
|
129 | + 'height', |
|
130 | + 'html', |
|
131 | + ); |
|
132 | + foreach ($props as $key) { |
|
133 | + if (!empty($data->$key)) { |
|
134 | + $meta[$key] = (string) $data->$key; |
|
135 | + } |
|
136 | + } |
|
137 | + return $meta; |
|
138 | + } |
|
139 | + |
|
140 | + /** |
|
141 | + * Parses metatags from DOM |
|
142 | + * |
|
143 | + * @param string $url URL |
|
144 | + * @return array|false |
|
145 | + */ |
|
146 | + public function getDOMData($url = '') { |
|
147 | + |
|
148 | + if (!$this->isHTML($url)) { |
|
149 | + return false; |
|
150 | + } |
|
151 | + |
|
152 | + $doc = $this->getDOM($url); |
|
153 | + $defaults = array( |
|
154 | + 'url' => $url, |
|
155 | + ); |
|
156 | + |
|
157 | + $link_tags = $this->parseLinkTags($doc); |
|
158 | + $meta_tags = $this->parseMetaTags($doc); |
|
159 | + $img_tags = $this->parseImgTags($doc); |
|
160 | + |
|
161 | + $meta = array_merge_recursive($defaults, $link_tags, $meta_tags, $img_tags); |
|
162 | 162 | |
163 | - if (empty($meta['title'])) { |
|
164 | - $meta['title'] = $this->parseTitle($doc); |
|
165 | - } |
|
166 | - |
|
167 | - |
|
168 | - return $meta; |
|
169 | - } |
|
170 | - |
|
171 | - /** |
|
172 | - * Check if URL exists and is reachable by making an HTTP request to retrieve header information |
|
173 | - * |
|
174 | - * @param string $url URL of the resource |
|
175 | - * @return boolean |
|
176 | - */ |
|
177 | - public function exists($url = '') { |
|
178 | - $response = $this->request($url); |
|
179 | - if ($response instanceof Response) { |
|
180 | - return $response->getStatusCode() == 200; |
|
181 | - } |
|
182 | - return false; |
|
183 | - } |
|
184 | - |
|
185 | - /** |
|
186 | - * Returns head of the resource |
|
187 | - * |
|
188 | - * @param string $url URL of the resource |
|
189 | - * @return Response|false |
|
190 | - */ |
|
191 | - public function request($url = '') { |
|
192 | - if (!filter_var($url, FILTER_VALIDATE_URL)) { |
|
193 | - return false; |
|
194 | - } |
|
195 | - if (!isset(self::$cache[$url])) { |
|
196 | - try { |
|
197 | - $response = $this->client->request('GET', $url); |
|
198 | - } catch (Exception $e) { |
|
199 | - $response = false; |
|
200 | - error_log("Parser Error for HEAD request ($url): {$e->getMessage()}"); |
|
201 | - } |
|
202 | - self::$cache[$url] = $response; |
|
203 | - } |
|
204 | - |
|
205 | - return self::$cache[$url]; |
|
206 | - } |
|
207 | - |
|
208 | - /** |
|
209 | - * Get contents of the page |
|
210 | - * |
|
211 | - * @param string $url URL of the resource |
|
212 | - * @return string |
|
213 | - */ |
|
214 | - public function read($url = '') { |
|
215 | - $body = ''; |
|
216 | - if (!$this->exists($url)) { |
|
217 | - return $body; |
|
218 | - } |
|
219 | - |
|
220 | - $response = $this->request($url); |
|
221 | - $body = (string) $response->getBody(); |
|
222 | - return $body; |
|
223 | - } |
|
224 | - |
|
225 | - /** |
|
226 | - * Checks if resource is an html page |
|
227 | - * |
|
228 | - * @param string $url URL of the resource |
|
229 | - * @return boolean |
|
230 | - */ |
|
231 | - public function isHTML($url = '') { |
|
232 | - $mime = $this->getContentType($url); |
|
233 | - return strpos($mime, 'text/html') !== false; |
|
234 | - } |
|
235 | - |
|
236 | - /** |
|
237 | - * Checks if resource is JSON |
|
238 | - * |
|
239 | - * @param string $url URL of the resource |
|
240 | - * @return boolean |
|
241 | - */ |
|
242 | - public function isJSON($url = '') { |
|
243 | - $mime = $this->getContentType($url); |
|
244 | - return strpos($mime, 'json') !== false; |
|
245 | - } |
|
246 | - |
|
247 | - /** |
|
248 | - * Checks if resource is XML |
|
249 | - * |
|
250 | - * @param string $url URL of the resource |
|
251 | - * @return boolean |
|
252 | - */ |
|
253 | - public function isXML($url = '') { |
|
254 | - $mime = $this->getContentType($url); |
|
255 | - return strpos($mime, 'xml') !== false; |
|
256 | - } |
|
257 | - |
|
258 | - /** |
|
259 | - * Checks if resource is an image |
|
260 | - * |
|
261 | - * @param string $url URL of the resource |
|
262 | - * @return boolean |
|
263 | - */ |
|
264 | - public function isImage($url = '') { |
|
265 | - $mime = $this->getContentType($url); |
|
266 | - if ($mime) { |
|
267 | - list($simple, ) = explode('/', $mime); |
|
268 | - return ($simple == 'image'); |
|
269 | - } |
|
270 | - |
|
271 | - return false; |
|
272 | - } |
|
273 | - |
|
274 | - /** |
|
275 | - * Get mime type of the URL content |
|
276 | - * |
|
277 | - * @param string $url URL of the resource |
|
278 | - * @return string |
|
279 | - */ |
|
280 | - public function getContentType($url = '') { |
|
281 | - $response = $this->request($url); |
|
282 | - if ($response instanceof Response) { |
|
283 | - $header = $response->getHeader('Content-Type'); |
|
284 | - if (is_array($header) && !empty($header)) { |
|
285 | - $parts = explode(';', $header[0]); |
|
286 | - return trim($parts[0]); |
|
287 | - } |
|
288 | - } |
|
289 | - return ''; |
|
290 | - } |
|
291 | - |
|
292 | - /** |
|
293 | - * Returns HTML contents of the page |
|
294 | - * |
|
295 | - * @param string $url URL of the resource |
|
296 | - * @return string |
|
297 | - */ |
|
298 | - public function getHTML($url = '') { |
|
299 | - if (!$this->isHTML($url)) { |
|
300 | - return ''; |
|
301 | - } |
|
302 | - return $this->read($url); |
|
303 | - } |
|
304 | - |
|
305 | - /** |
|
306 | - * Returns HTML contents of the page as a DOMDocument |
|
307 | - * |
|
308 | - * @param string $url URL of the resource |
|
309 | - * @return DOMDocument |
|
310 | - */ |
|
311 | - public function getDOM($url = '') { |
|
312 | - $html = $this->getHTML($url); |
|
313 | - $doc = new DOMDocument(); |
|
314 | - $doc->loadHTML($html); |
|
315 | - if (!$doc->documentURI) { |
|
316 | - $doc->documentURI = $url; |
|
317 | - } |
|
318 | - return $doc; |
|
319 | - } |
|
320 | - |
|
321 | - /** |
|
322 | - * Parses document title |
|
323 | - * |
|
324 | - * @param DOMDocument $doc Document |
|
325 | - * @return string |
|
326 | - */ |
|
327 | - public function parseTitle(DOMDocument $doc) { |
|
328 | - $node = $doc->getElementsByTagName('title'); |
|
329 | - $title = $node->item(0)->nodeValue; |
|
330 | - return ($title) ? : ''; |
|
331 | - } |
|
332 | - |
|
333 | - /** |
|
334 | - * Parses <link> tags |
|
335 | - * |
|
336 | - * @param DOMDocument $doc Document |
|
337 | - * @return array |
|
338 | - */ |
|
339 | - public function parseLinkTags(DOMDocument $doc) { |
|
340 | - |
|
341 | - $meta = array(); |
|
342 | - |
|
343 | - $nodes = $doc->getElementsByTagName('link'); |
|
344 | - foreach ($nodes as $node) { |
|
345 | - $rel = $node->getAttribute('rel'); |
|
346 | - $href = $node->getAttribute('href'); |
|
347 | - |
|
348 | - switch ($rel) { |
|
349 | - |
|
350 | - case 'icon' : |
|
351 | - $meta['icons'][] = $this->getAbsoluteURL($doc, $href); |
|
352 | - break; |
|
353 | - |
|
354 | - case 'canonical' : |
|
355 | - $meta['canonical'] = $this->getAbsoluteURL($doc, $href); |
|
356 | - break; |
|
357 | - |
|
358 | - case 'alternate' : |
|
359 | - $type = $node->getAttribute('type'); |
|
360 | - if (in_array($type, array( |
|
361 | - 'application/json+oembed', |
|
362 | - 'text/json+oembed', |
|
363 | - 'application/xml+oembed', |
|
364 | - 'text/xml+oembed' |
|
365 | - ))) { |
|
366 | - $meta['oembed_url'][] = $this->getAbsoluteURL($doc, $href); |
|
367 | - } |
|
368 | - break; |
|
369 | - } |
|
370 | - } |
|
371 | - |
|
372 | - return $meta; |
|
373 | - } |
|
374 | - |
|
375 | - /** |
|
376 | - * Parses <meta> tags |
|
377 | - * |
|
378 | - * @param DOMDocument $doc Document |
|
379 | - * @return array |
|
380 | - */ |
|
381 | - public function parseMetaTags(DOMDocument $doc) { |
|
382 | - |
|
383 | - $meta = array(); |
|
384 | - |
|
385 | - $nodes = $doc->getElementsByTagName('meta'); |
|
386 | - if (!empty($nodes)) { |
|
387 | - foreach ($nodes as $node) { |
|
388 | - $name = $node->getAttribute('name'); |
|
389 | - if (!$name) { |
|
390 | - $name = $node->getAttribute('property'); |
|
391 | - } |
|
392 | - if (!$name) { |
|
393 | - continue; |
|
394 | - } |
|
395 | - |
|
396 | - $name = strtolower($name); |
|
397 | - |
|
398 | - $content = $node->getAttribute('content'); |
|
399 | - if (isset($meta['metatags'][$name])) { |
|
400 | - if (!is_array($meta['metatags'][$name])) { |
|
401 | - $meta['metatags'][$name] = array($meta['metatags'][$name]); |
|
402 | - } |
|
403 | - $meta['metatags'][$name][] = $content; |
|
404 | - } else { |
|
405 | - $meta['metatags'][$name] = $content; |
|
406 | - } |
|
407 | - |
|
408 | - switch ($name) { |
|
409 | - |
|
410 | - case 'title' : |
|
411 | - case 'og:title' : |
|
412 | - case 'twitter:title' : |
|
413 | - if (empty($meta['title'])) { |
|
414 | - $meta['title'] = $content; |
|
415 | - } |
|
416 | - break; |
|
417 | - |
|
418 | - case 'og:type' : |
|
419 | - if (empty($meta['type'])) { |
|
420 | - $meta['type'] = $content; |
|
421 | - } |
|
422 | - break; |
|
423 | - |
|
424 | - case 'description' : |
|
425 | - case 'og:description' : |
|
426 | - case 'twitter:description' : |
|
427 | - if (empty($meta['description'])) { |
|
428 | - $meta['description'] = $content; |
|
429 | - } |
|
430 | - break; |
|
431 | - |
|
432 | - case 'keywords' : |
|
433 | - if (is_string($content)) { |
|
434 | - $content = explode(',', $content); |
|
435 | - $content = array_map('trim', $content); |
|
436 | - } |
|
437 | - $meta['tags'] = $content; |
|
438 | - break; |
|
439 | - |
|
440 | - case 'og:site_name' : |
|
441 | - case 'twitter:site' : |
|
442 | - if (empty($meta['provider_name'])) { |
|
443 | - $meta['provider_name'] = $content; |
|
444 | - } |
|
445 | - break; |
|
446 | - |
|
447 | - case 'og:image' : |
|
448 | - case 'twitter:image' : |
|
449 | - $meta['thumbnails'][] = $this->getAbsoluteURL($doc, $content); |
|
450 | - break; |
|
451 | - } |
|
452 | - } |
|
453 | - } |
|
454 | - |
|
455 | - return $meta; |
|
456 | - } |
|
457 | - |
|
458 | - /** |
|
459 | - * Parses <img> tags |
|
460 | - * |
|
461 | - * @param DOMDocument $doc Document |
|
462 | - * @return array |
|
463 | - */ |
|
464 | - public function parseImgTags(DOMDocument $doc) { |
|
465 | - |
|
466 | - $meta = array(); |
|
467 | - |
|
468 | - $nodes = $doc->getElementsByTagName('img'); |
|
469 | - foreach ($nodes as $node) { |
|
470 | - $src = $node->getAttribute('src'); |
|
471 | - $meta['thumbnails'][] = $this->getAbsoluteURL($doc, $src); |
|
472 | - } |
|
473 | - |
|
474 | - return $meta; |
|
475 | - } |
|
476 | - |
|
477 | - /** |
|
478 | - * Normalizes relative URLs |
|
479 | - * |
|
480 | - * @param DOMDocument $doc Document |
|
481 | - * @param string $href URL to normalize |
|
482 | - * @return string |
|
483 | - */ |
|
484 | - public function getAbsoluteURL(DOMDocument $doc, $href = '') { |
|
485 | - |
|
486 | - // Check if $url is absolute |
|
487 | - if (parse_url($href, PHP_URL_HOST)) { |
|
488 | - return $href; |
|
489 | - } |
|
490 | - |
|
491 | - $uri = trim($doc->documentURI ? : '', '/'); |
|
492 | - |
|
493 | - // Check if $url is relative to root |
|
494 | - if (substr($href, 0, 1) === "/") { |
|
495 | - $scheme = parse_url($uri, PHP_URL_SCHEME); |
|
496 | - $host = parse_url($uri, PHP_URL_HOST); |
|
497 | - return "$scheme://$host$href"; |
|
498 | - } |
|
499 | - |
|
500 | - // $url is relative to page |
|
501 | - return "$uri/$href"; |
|
502 | - } |
|
163 | + if (empty($meta['title'])) { |
|
164 | + $meta['title'] = $this->parseTitle($doc); |
|
165 | + } |
|
166 | + |
|
167 | + |
|
168 | + return $meta; |
|
169 | + } |
|
170 | + |
|
171 | + /** |
|
172 | + * Check if URL exists and is reachable by making an HTTP request to retrieve header information |
|
173 | + * |
|
174 | + * @param string $url URL of the resource |
|
175 | + * @return boolean |
|
176 | + */ |
|
177 | + public function exists($url = '') { |
|
178 | + $response = $this->request($url); |
|
179 | + if ($response instanceof Response) { |
|
180 | + return $response->getStatusCode() == 200; |
|
181 | + } |
|
182 | + return false; |
|
183 | + } |
|
184 | + |
|
185 | + /** |
|
186 | + * Sends a GET request for the resource and returns the response (cached per URL) |
|
187 | + * |
|
188 | + * @param string $url URL of the resource |
|
189 | + * @return Response|false |
|
190 | + */ |
|
191 | + public function request($url = '') { |
|
192 | + if (!filter_var($url, FILTER_VALIDATE_URL)) { |
|
193 | + return false; |
|
194 | + } |
|
195 | + if (!isset(self::$cache[$url])) { |
|
196 | + try { |
|
197 | + $response = $this->client->request('GET', $url); |
|
198 | + } catch (Exception $e) { |
|
199 | + $response = false; |
|
200 | + error_log("Parser Error for HEAD request ($url): {$e->getMessage()}"); |
|
201 | + } |
|
202 | + self::$cache[$url] = $response; |
|
203 | + } |
|
204 | + |
|
205 | + return self::$cache[$url]; |
|
206 | + } |
|
207 | + |
|
208 | + /** |
|
209 | + * Get contents of the page |
|
210 | + * |
|
211 | + * @param string $url URL of the resource |
|
212 | + * @return string |
|
213 | + */ |
|
214 | + public function read($url = '') { |
|
215 | + $body = ''; |
|
216 | + if (!$this->exists($url)) { |
|
217 | + return $body; |
|
218 | + } |
|
219 | + |
|
220 | + $response = $this->request($url); |
|
221 | + $body = (string) $response->getBody(); |
|
222 | + return $body; |
|
223 | + } |
|
224 | + |
|
225 | + /** |
|
226 | + * Checks if resource is an html page |
|
227 | + * |
|
228 | + * @param string $url URL of the resource |
|
229 | + * @return boolean |
|
230 | + */ |
|
231 | + public function isHTML($url = '') { |
|
232 | + $mime = $this->getContentType($url); |
|
233 | + return strpos($mime, 'text/html') !== false; |
|
234 | + } |
|
235 | + |
|
236 | + /** |
|
237 | + * Checks if resource is JSON |
|
238 | + * |
|
239 | + * @param string $url URL of the resource |
|
240 | + * @return boolean |
|
241 | + */ |
|
242 | + public function isJSON($url = '') { |
|
243 | + $mime = $this->getContentType($url); |
|
244 | + return strpos($mime, 'json') !== false; |
|
245 | + } |
|
246 | + |
|
247 | + /** |
|
248 | + * Checks if resource is XML |
|
249 | + * |
|
250 | + * @param string $url URL of the resource |
|
251 | + * @return boolean |
|
252 | + */ |
|
253 | + public function isXML($url = '') { |
|
254 | + $mime = $this->getContentType($url); |
|
255 | + return strpos($mime, 'xml') !== false; |
|
256 | + } |
|
257 | + |
|
258 | + /** |
|
259 | + * Checks if resource is an image |
|
260 | + * |
|
261 | + * @param string $url URL of the resource |
|
262 | + * @return boolean |
|
263 | + */ |
|
264 | + public function isImage($url = '') { |
|
265 | + $mime = $this->getContentType($url); |
|
266 | + if ($mime) { |
|
267 | + list($simple, ) = explode('/', $mime); |
|
268 | + return ($simple == 'image'); |
|
269 | + } |
|
270 | + |
|
271 | + return false; |
|
272 | + } |
|
273 | + |
|
274 | + /** |
|
275 | + * Get mime type of the URL content |
|
276 | + * |
|
277 | + * @param string $url URL of the resource |
|
278 | + * @return string |
|
279 | + */ |
|
280 | + public function getContentType($url = '') { |
|
281 | + $response = $this->request($url); |
|
282 | + if ($response instanceof Response) { |
|
283 | + $header = $response->getHeader('Content-Type'); |
|
284 | + if (is_array($header) && !empty($header)) { |
|
285 | + $parts = explode(';', $header[0]); |
|
286 | + return trim($parts[0]); |
|
287 | + } |
|
288 | + } |
|
289 | + return ''; |
|
290 | + } |
|
291 | + |
|
292 | + /** |
|
293 | + * Returns HTML contents of the page |
|
294 | + * |
|
295 | + * @param string $url URL of the resource |
|
296 | + * @return string |
|
297 | + */ |
|
298 | + public function getHTML($url = '') { |
|
299 | + if (!$this->isHTML($url)) { |
|
300 | + return ''; |
|
301 | + } |
|
302 | + return $this->read($url); |
|
303 | + } |
|
304 | + |
|
305 | + /** |
|
306 | + * Returns HTML contents of the page as a DOMDocument |
|
307 | + * |
|
308 | + * @param string $url URL of the resource |
|
309 | + * @return DOMDocument |
|
310 | + */ |
|
311 | + public function getDOM($url = '') { |
|
312 | + $html = $this->getHTML($url); |
|
313 | + $doc = new DOMDocument(); |
|
314 | + $doc->loadHTML($html); |
|
315 | + if (!$doc->documentURI) { |
|
316 | + $doc->documentURI = $url; |
|
317 | + } |
|
318 | + return $doc; |
|
319 | + } |
|
320 | + |
|
321 | + /** |
|
322 | + * Parses document title |
|
323 | + * |
|
324 | + * @param DOMDocument $doc Document |
|
325 | + * @return string |
|
326 | + */ |
|
327 | + public function parseTitle(DOMDocument $doc) { |
|
328 | + $node = $doc->getElementsByTagName('title'); |
|
329 | + $title = $node->item(0)->nodeValue; |
|
330 | + return ($title) ? : ''; |
|
331 | + } |
|
332 | + |
|
333 | + /** |
|
334 | + * Parses <link> tags |
|
335 | + * |
|
336 | + * @param DOMDocument $doc Document |
|
337 | + * @return array |
|
338 | + */ |
|
339 | + public function parseLinkTags(DOMDocument $doc) { |
|
340 | + |
|
341 | + $meta = array(); |
|
342 | + |
|
343 | + $nodes = $doc->getElementsByTagName('link'); |
|
344 | + foreach ($nodes as $node) { |
|
345 | + $rel = $node->getAttribute('rel'); |
|
346 | + $href = $node->getAttribute('href'); |
|
347 | + |
|
348 | + switch ($rel) { |
|
349 | + |
|
350 | + case 'icon' : |
|
351 | + $meta['icons'][] = $this->getAbsoluteURL($doc, $href); |
|
352 | + break; |
|
353 | + |
|
354 | + case 'canonical' : |
|
355 | + $meta['canonical'] = $this->getAbsoluteURL($doc, $href); |
|
356 | + break; |
|
357 | + |
|
358 | + case 'alternate' : |
|
359 | + $type = $node->getAttribute('type'); |
|
360 | + if (in_array($type, array( |
|
361 | + 'application/json+oembed', |
|
362 | + 'text/json+oembed', |
|
363 | + 'application/xml+oembed', |
|
364 | + 'text/xml+oembed' |
|
365 | + ))) { |
|
366 | + $meta['oembed_url'][] = $this->getAbsoluteURL($doc, $href); |
|
367 | + } |
|
368 | + break; |
|
369 | + } |
|
370 | + } |
|
371 | + |
|
372 | + return $meta; |
|
373 | + } |
|
374 | + |
|
375 | + /** |
|
376 | + * Parses <meta> tags |
|
377 | + * |
|
378 | + * @param DOMDocument $doc Document |
|
379 | + * @return array |
|
380 | + */ |
|
381 | + public function parseMetaTags(DOMDocument $doc) { |
|
382 | + |
|
383 | + $meta = array(); |
|
384 | + |
|
385 | + $nodes = $doc->getElementsByTagName('meta'); |
|
386 | + if (!empty($nodes)) { |
|
387 | + foreach ($nodes as $node) { |
|
388 | + $name = $node->getAttribute('name'); |
|
389 | + if (!$name) { |
|
390 | + $name = $node->getAttribute('property'); |
|
391 | + } |
|
392 | + if (!$name) { |
|
393 | + continue; |
|
394 | + } |
|
395 | + |
|
396 | + $name = strtolower($name); |
|
397 | + |
|
398 | + $content = $node->getAttribute('content'); |
|
399 | + if (isset($meta['metatags'][$name])) { |
|
400 | + if (!is_array($meta['metatags'][$name])) { |
|
401 | + $meta['metatags'][$name] = array($meta['metatags'][$name]); |
|
402 | + } |
|
403 | + $meta['metatags'][$name][] = $content; |
|
404 | + } else { |
|
405 | + $meta['metatags'][$name] = $content; |
|
406 | + } |
|
407 | + |
|
408 | + switch ($name) { |
|
409 | + |
|
410 | + case 'title' : |
|
411 | + case 'og:title' : |
|
412 | + case 'twitter:title' : |
|
413 | + if (empty($meta['title'])) { |
|
414 | + $meta['title'] = $content; |
|
415 | + } |
|
416 | + break; |
|
417 | + |
|
418 | + case 'og:type' : |
|
419 | + if (empty($meta['type'])) { |
|
420 | + $meta['type'] = $content; |
|
421 | + } |
|
422 | + break; |
|
423 | + |
|
424 | + case 'description' : |
|
425 | + case 'og:description' : |
|
426 | + case 'twitter:description' : |
|
427 | + if (empty($meta['description'])) { |
|
428 | + $meta['description'] = $content; |
|
429 | + } |
|
430 | + break; |
|
431 | + |
|
432 | + case 'keywords' : |
|
433 | + if (is_string($content)) { |
|
434 | + $content = explode(',', $content); |
|
435 | + $content = array_map('trim', $content); |
|
436 | + } |
|
437 | + $meta['tags'] = $content; |
|
438 | + break; |
|
439 | + |
|
440 | + case 'og:site_name' : |
|
441 | + case 'twitter:site' : |
|
442 | + if (empty($meta['provider_name'])) { |
|
443 | + $meta['provider_name'] = $content; |
|
444 | + } |
|
445 | + break; |
|
446 | + |
|
447 | + case 'og:image' : |
|
448 | + case 'twitter:image' : |
|
449 | + $meta['thumbnails'][] = $this->getAbsoluteURL($doc, $content); |
|
450 | + break; |
|
451 | + } |
|
452 | + } |
|
453 | + } |
|
454 | + |
|
455 | + return $meta; |
|
456 | + } |
|
457 | + |
|
458 | + /** |
|
459 | + * Parses <img> tags |
|
460 | + * |
|
461 | + * @param DOMDocument $doc Document |
|
462 | + * @return array |
|
463 | + */ |
|
464 | + public function parseImgTags(DOMDocument $doc) { |
|
465 | + |
|
466 | + $meta = array(); |
|
467 | + |
|
468 | + $nodes = $doc->getElementsByTagName('img'); |
|
469 | + foreach ($nodes as $node) { |
|
470 | + $src = $node->getAttribute('src'); |
|
471 | + $meta['thumbnails'][] = $this->getAbsoluteURL($doc, $src); |
|
472 | + } |
|
473 | + |
|
474 | + return $meta; |
|
475 | + } |
|
476 | + |
|
477 | + /** |
|
478 | + * Normalizes relative URLs |
|
479 | + * |
|
480 | + * @param DOMDocument $doc Document |
|
481 | + * @param string $href URL to normalize |
|
482 | + * @return string |
|
483 | + */ |
|
484 | + public function getAbsoluteURL(DOMDocument $doc, $href = '') { |
|
485 | + |
|
486 | + // Check if $href is absolute |
|
487 | + if (parse_url($href, PHP_URL_HOST)) { |
|
488 | + return $href; |
|
489 | + } |
|
490 | + |
|
491 | + $uri = trim($doc->documentURI ? : '', '/'); |
|
492 | + |
|
493 | + // Check if $href is relative to root |
|
494 | + if (substr($href, 0, 1) === "/") { |
|
495 | + $scheme = parse_url($uri, PHP_URL_SCHEME); |
|
496 | + $host = parse_url($uri, PHP_URL_HOST); |
|
497 | + return "$scheme://$host$href"; |
|
498 | + } |
|
499 | + |
|
500 | + // $href is relative to page |
|
501 | + return "$uri/$href"; |
|
502 | + } |
|
503 | 503 | |
504 | 504 | } |