Passed
Push — master ( 507238...6290b2 )
by Ismayil
04:35 queued 02:12
created
src/hypeJunction/Parser.php 3 patches
Doc Comments   +1 added lines, -1 removed lines patch added patch discarded remove patch
@@ -339,7 +339,7 @@
 block discarded – undo
339 339
 	 * Returns HTML contents of the page as a DOMDocument
340 340
 	 *
341 341
 	 * @param string $url URL of the resource
342
-	 * @return DOMDocument|false
342
+	 * @return DOMDocument
343 343
 	 */
344 344
 	public function getDOM($url = '') {
345 345
 		$html = $this->getHTML($url);
Please login to merge, or discard this patch.
Spacing   +1 added lines, -1 removed lines patch added patch discarded remove patch
@@ -297,7 +297,7 @@
 block discarded – undo
297 297
 	public function isImage($url = '') {
298 298
 		$mime = $this->getContentType($url);
299 299
 		if ($mime) {
300
-			list($simple, ) = explode('/', $mime);
300
+			list($simple,) = explode('/', $mime);
301 301
 			return ($simple == 'image');
302 302
 		}
303 303
 
Please login to merge, or discard this patch.
Indentation   +564 added lines, -564 removed lines patch added patch discarded remove patch
@@ -12,569 +12,569 @@
 block discarded – undo
12 12
  */
13 13
 class Parser {
14 14
 
15
-	/**
16
-	 * @var ClientInterface
17
-	 */
18
-	private $client;
19
-
20
-	/**
21
-	 * @var array
22
-	 */
23
-	private static $cache;
24
-
25
-	/**
26
-	 * @var array
27
-	 */
28
-	private $urls = [];
29
-
30
-	/**
31
-	 * Constructor
32
-	 * @param ClientInterface $client HTTP Client
33
-	 */
34
-	public function __construct(ClientInterface $client) {
35
-		$this->client = $client;
36
-	}
37
-
38
-	/**
39
-	 * Parses a URL into an array of metatags
40
-	 *
41
-	 * @param string $url URL to parse
42
-	 * @return array
43
-	 */
44
-	public function parse($url = '') {
45
-		$data = $this->getImageData($url);
46
-		if (!$data) {
47
-			$data = $this->getOEmbedData($url);
48
-		}
49
-		if (!$data) {
50
-			$data = $this->getDOMData($url);
51
-			if (is_array($data) && !empty($data['oembed_url'])) {
52
-				foreach ($data['oembed_url'] as $oembed_url) {
53
-					$oembed_data = $this->getOEmbedData($oembed_url);
54
-					if (!empty($oembed_data) && is_array($oembed_data)) {
55
-						$oembed_data['oembed_url'] = $oembed_data['url'];
56
-						unset($oembed_data['url']);
57
-						$data = array_merge($data, $oembed_data);
58
-					}
59
-				}
60
-			}
61
-		}
62
-
63
-		if (!is_array($data)) {
64
-			$data = array();
65
-		}
66
-
67
-		if (empty($data['thumbnail_url']) && !empty($data['thumbnails'])) {
68
-			$data['thumbnail_url'] = $data['thumbnails'][0];
69
-		}
70
-
71
-		return $data;
72
-	}
73
-
74
-	/**
75
-	 * Parses image metatags
76
-	 *
77
-	 * @param string $url URL of the image
78
-	 * @return array|false
79
-	 */
80
-	public function getImageData($url = '') {
81
-		if (!$this->isImage($url)) {
82
-			return false;
83
-		}
84
-
85
-		return array(
86
-			'type' => 'photo',
87
-			'url' => $url,
88
-			'thumbnails' => array($url),
89
-		);
90
-	}
91
-
92
-	/**
93
-	 * Parses OEmbed data
94
-	 *
95
-	 * @param  string $url URL of the image
96
-	 * @return array|false
97
-	 */
98
-	public function getOEmbedData($url = '') {
99
-
100
-		if (!$this->isJSON($url) && !$this->isXML($url)) {
101
-			return false;
102
-		}
103
-
104
-		$meta = array(
105
-			'url' => $url,
106
-		);
107
-
108
-		$content = $this->read($url);
109
-		if (!$content) {
110
-			return $meta;
111
-		}
112
-
113
-		$data = new \stdClass();
114
-		if ($this->isJSON($url)) {
115
-			$data = json_decode($content);
116
-		} else if ($this->isXML($url)) {
117
-			$data = simplexml_load_string($content);
118
-		}
119
-
120
-		$props = array(
121
-			'type',
122
-			'version',
123
-			'title',
124
-			'author_name',
125
-			'author_url',
126
-			'provider_name',
127
-			'provider_url',
128
-			'cache_age',
129
-			'thumbnail_url',
130
-			'thumbnail_width',
131
-			'thumbnail_height',
132
-			'width',
133
-			'height',
134
-			'html',
135
-		);
136
-		foreach ($props as $key) {
137
-			if (!empty($data->$key)) {
138
-				$meta[$key] = (string) $data->$key;
139
-			}
140
-		}
141
-		return $meta;
142
-	}
143
-
144
-	/**
145
-	 * Parses metatags from DOM
146
-	 *
147
-	 * @param  string $url URL
148
-	 * @return array|false
149
-	 */
150
-	public function getDOMData($url = '') {
151
-
152
-		if (!$this->isHTML($url)) {
153
-			return false;
154
-		}
155
-
156
-		$doc = $this->getDOM($url);
157
-		if (!$doc) {
158
-			return false;
159
-		}
160
-
161
-		$defaults = array(
162
-			'url' => $url,
163
-		);
164
-
165
-		$link_tags = $this->parseLinkTags($doc);
166
-		$meta_tags = $this->parseMetaTags($doc);
167
-		$img_tags = $this->parseImgTags($doc);
168
-
169
-		$meta = array_merge_recursive($defaults, $link_tags, $meta_tags, $img_tags);
170
-
171
-		if (empty($meta['title'])) {
172
-			$meta['title'] = $this->parseTitle($doc);
173
-		}
174
-
175
-
176
-		return $meta;
177
-	}
178
-
179
-	/**
180
-	 * Check if URL exists and is reachable by making an HTTP request to retrieve header information
181
-	 *
182
-	 * @param string $url URL of the resource
183
-	 * @return boolean
184
-	 */
185
-	public function exists($url = '') {
186
-		$response = $this->request($url);
187
-		if ($response instanceof Response) {
188
-			return $response->getStatusCode() == 200;
189
-		}
190
-		return false;
191
-	}
192
-
193
-	/**
194
-	 * Validate URL
195
-	 *
196
-	 * @param string $url URL to validate
197
-	 * @return bool
198
-	 */
199
-	public function isValidUrl($url = '') {
200
-		// based on http://php.net/manual/en/function.filter-var.php#104160
201
-		// adapted by @mrclay in https://github.com/mrclay/Elgg-leaf/blob/62bf31c0ccdaab549a7e585a4412443e09821db3/engine/lib/output.php
202
-		$res = filter_var($url, FILTER_VALIDATE_URL);
203
-		if ($res) {
204
-			return $res;
205
-		}
206
-		// Check if it has unicode chars.
207
-		$l = mb_strlen($url);
208
-		if (strlen($url) == $l) {
209
-			return $res;
210
-		}
211
-		// Replace wide chars by “X”.
212
-		$s = '';
213
-		for ($i = 0; $i < $l; ++$i) {
214
-			$ch = elgg_substr($url, $i, 1);
215
-			$s .= (strlen($ch) > 1) ? 'X' : $ch;
216
-		}
217
-		// Re-check now.
218
-		return filter_var($s, FILTER_VALIDATE_URL) ? $url : false;
219
-	}
220
-
221
-	/**
222
-	 * Returns head of the resource
223
-	 *
224
-	 * @param string $url URL of the resource
225
-	 * @return Response|false
226
-	 */
227
-	public function request($url = '') {
228
-		$url = str_replace(' ', '%20', $url);
229
-		if (!$this->isValidUrl($url)) {
230
-			return false;
231
-		}
232
-		if (!isset(self::$cache[$url])) {
233
-			try {
234
-				$response = $this->client->request('GET', $url);
235
-			} catch (Exception $e) {
236
-				$response = false;
237
-				error_log("Parser Error for HEAD request ($url): {$e->getMessage()}");
238
-			}
239
-			self::$cache[$url] = $response;
240
-		}
241
-
242
-		return self::$cache[$url];
243
-	}
244
-
245
-	/**
246
-	 * Get contents of the page
247
-	 *
248
-	 * @param string $url URL of the resource
249
-	 * @return string
250
-	 */
251
-	public function read($url = '') {
252
-		$body = '';
253
-		if (!$this->exists($url)) {
254
-			return $body;
255
-		}
256
-
257
-		$response = $this->request($url);
258
-		$body = (string) $response->getBody();
259
-		return $body;
260
-	}
261
-
262
-	/**
263
-	 * Checks if resource is an html page
264
-	 *
265
-	 * @param string $url URL of the resource
266
-	 * @return boolean
267
-	 */
268
-	public function isHTML($url = '') {
269
-		$mime = $this->getContentType($url);
270
-		return strpos($mime, 'text/html') !== false;
271
-	}
272
-
273
-	/**
274
-	 * Checks if resource is JSON
275
-	 *
276
-	 * @param string $url URL of the resource
277
-	 * @return boolean
278
-	 */
279
-	public function isJSON($url = '') {
280
-		$mime = $this->getContentType($url);
281
-		return strpos($mime, 'json') !== false;
282
-	}
283
-
284
-	/**
285
-	 * Checks if resource is XML
286
-	 *
287
-	 * @param string $url URL of the resource
288
-	 * @return boolean
289
-	 */
290
-	public function isXML($url = '') {
291
-		$mime = $this->getContentType($url);
292
-		return strpos($mime, 'xml') !== false;
293
-	}
294
-
295
-	/**
296
-	 * Checks if resource is an image
297
-	 *
298
-	 * @param string $url URL of the resource
299
-	 * @return boolean
300
-	 */
301
-	public function isImage($url = '') {
302
-		$mime = $this->getContentType($url);
303
-		if ($mime) {
304
-			list($simple, ) = explode('/', $mime);
305
-			return ($simple == 'image');
306
-		}
307
-
308
-		return false;
309
-	}
310
-
311
-	/**
312
-	 * Get mime type of the URL content
313
-	 *
314
-	 * @param string $url URL of the resource
315
-	 * @return string
316
-	 */
317
-	public function getContentType($url = '') {
318
-		$response = $this->request($url);
319
-		if ($response instanceof Response) {
320
-			$header = $response->getHeader('Content-Type');
321
-			if (is_array($header) && !empty($header)) {
322
-				$parts = explode(';', $header[0]);
323
-				return trim($parts[0]);
324
-			}
325
-		}
326
-		return '';
327
-	}
328
-
329
-	/**
330
-	 * Returns HTML contents of the page
331
-	 *
332
-	 * @param string $url URL of the resource
333
-	 * @return string
334
-	 */
335
-	public function getHTML($url = '') {
336
-		if (!$this->isHTML($url)) {
337
-			return '';
338
-		}
339
-		return $this->read($url);
340
-	}
341
-
342
-	/**
343
-	 * Returns HTML contents of the page as a DOMDocument
344
-	 *
345
-	 * @param string $url URL of the resource
346
-	 * @return DOMDocument|false
347
-	 */
348
-	public function getDOM($url = '') {
349
-		$html = $this->getHTML($url);
350
-		if (empty($html)) {
351
-			return false;
352
-		}
353
-		$doc = new DOMDocument();
354
-
355
-		libxml_use_internal_errors(true);
356
-
357
-		if (is_callable('mb_convert_encoding')) {
358
-			$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
359
-		} else {
360
-			$doc->loadHTML($html);
361
-		}
362
-		if (!$doc->documentURI) {
363
-			$doc->documentURI = $url;
364
-		}
365
-
366
-		libxml_clear_errors();
367
-
368
-		return $doc;
369
-	}
370
-
371
-	/**
372
-	 * Parses document title
373
-	 *
374
-	 * @param DOMDocument $doc Document
375
-	 * @return string
376
-	 */
377
-	public function parseTitle(DOMDocument $doc) {
378
-		$node = $doc->getElementsByTagName('title');
379
-		$title = $node->item(0)->nodeValue;
380
-		return ($title) ?: '';
381
-	}
382
-
383
-	/**
384
-	 * Parses <link> tags
385
-	 *
386
-	 * @param DOMDocument $doc Document
387
-	 * @return array
388
-	 */
389
-	public function parseLinkTags(DOMDocument $doc) {
390
-
391
-		$meta = array(
392
-			'icons' => [],
393
-			'thumbnails' => [],
394
-		);
395
-
396
-		$nodes = $doc->getElementsByTagName('link');
397
-		foreach ($nodes as $node) {
398
-			$rel = $node->getAttribute('rel');
399
-			$href = $node->getAttribute('href');
400
-
401
-			switch ($rel) {
402
-
403
-				case 'icon' :
404
-					$image_url = $this->getAbsoluteURL($doc, $href);
405
-					if ($this->isImage($image_url)) {
406
-						$meta['icons'][] = $image_url;
407
-					}
408
-					break;
409
-
410
-				case 'canonical' :
411
-					$meta['canonical'] = $this->getAbsoluteURL($doc, $href);
412
-					break;
413
-
414
-				case 'alternate' :
415
-					$type = $node->getAttribute('type');
416
-					if (in_array($type, array(
417
-						'application/json+oembed',
418
-						'text/json+oembed',
419
-						'application/xml+oembed',
420
-						'text/xml+oembed'
421
-					))) {
422
-						$meta['oembed_url'][] = $this->getAbsoluteURL($doc, $href);
423
-					}
424
-					break;
425
-			}
426
-		}
427
-
428
-		return $meta;
429
-	}
430
-
431
-	/**
432
-	 * Parses <meta> tags
433
-	 *
434
-	 * @param DOMDocument $doc Document
435
-	 * @return array
436
-	 */
437
-	public function parseMetaTags(DOMDocument $doc) {
438
-
439
-		$meta = array();
440
-
441
-		$nodes = $doc->getElementsByTagName('meta');
442
-		if (!empty($nodes)) {
443
-			foreach ($nodes as $node) {
444
-				$name = $node->getAttribute('name');
445
-				if (!$name) {
446
-					$name = $node->getAttribute('property');
447
-				}
448
-				if (!$name) {
449
-					continue;
450
-				}
451
-
452
-				$name = strtolower($name);
453
-
454
-				if ($name == 'og:image:url' || $name == 'og:image:secure_url') {
455
-					$name = 'og:image';
456
-				}
457
-
458
-				$content = $node->getAttribute('content');
459
-				if (isset($meta['metatags'][$name])) {
460
-					if (!is_array($meta['metatags'][$name])) {
461
-						$meta['metatags'][$name] = array($meta['metatags'][$name]);
462
-					}
463
-					$meta['metatags'][$name][] = $content;
464
-				} else {
465
-					$meta['metatags'][$name] = $content;
466
-				}
467
-
468
-				switch ($name) {
469
-
470
-					case 'title' :
471
-					case 'og:title' :
472
-					case 'twitter:title' :
473
-						if (empty($meta['title'])) {
474
-							$meta['title'] = $content;
475
-						}
476
-						break;
477
-
478
-					case 'og:type' :
479
-						if (empty($meta['type'])) {
480
-							$meta['type'] = $content;
481
-						}
482
-						break;
483
-
484
-					case 'description' :
485
-					case 'og:description' :
486
-					case 'twitter:description' :
487
-						if (empty($meta['description'])) {
488
-							$meta['description'] = $content;
489
-						}
490
-						break;
491
-
492
-					case 'keywords' :
493
-						if (is_string($content)) {
494
-							$content = explode(',', $content);
495
-							$content = array_map('trim', $content);
496
-						}
497
-						$meta['tags'] = $content;
498
-						break;
499
-
500
-					case 'og:site_name' :
501
-					case 'twitter:site' :
502
-						if (empty($meta['provider_name'])) {
503
-							$meta['provider_name'] = $content;
504
-						}
505
-						break;
506
-
507
-					case 'og:image' :
508
-					case 'twitter:image' :
509
-						$image_url = $this->getAbsoluteURL($doc, $content);
510
-						if ($this->isImage($image_url)) {
511
-							$meta['thumbnails'][] = $image_url;
512
-						}
513
-						break;
514
-				}
515
-			}
516
-		}
517
-
518
-		return $meta;
519
-	}
520
-
521
-	/**
522
-	 * Parses <img> tags
523
-	 *
524
-	 * @param DOMDocument $doc Document
525
-	 * @return array
526
-	 */
527
-	public function parseImgTags(DOMDocument $doc) {
528
-
529
-		$meta = array(
530
-			'thumbnails' => [],
531
-		);
532
-
533
-		$nodes = $doc->getElementsByTagName('img');
534
-		foreach ($nodes as $node) {
535
-			$src = $node->getAttribute('src');
536
-			$image_url = $this->getAbsoluteURL($doc, $src);
537
-			if ($this->isImage($image_url)) {
538
-				$meta['thumbnails'][] = $image_url;
539
-			}
540
-		}
541
-
542
-		return $meta;
543
-	}
544
-
545
-	/**
546
-	 * Normalizes relative URLs
547
-	 *
548
-	 * @param DOMDocument $doc  Document
549
-	 * @param string      $href URL to normalize
550
-	 * @return string|false
551
-	 */
552
-	public function getAbsoluteURL(DOMDocument $doc, $href = '') {
553
-
554
-		if (preg_match("/^data:/i", $href)) {
555
-			// data URIs can not be resolved
556
-			return false;
557
-		}
558
-
559
-		// Check if $href is absolute
560
-		if (parse_url($href, PHP_URL_HOST)) {
561
-			return $href;
562
-		}
563
-
564
-		$uri = trim($doc->documentURI ?: '', '/');
565
-
566
-		$scheme = parse_url($uri, PHP_URL_SCHEME);
567
-		$host = parse_url($uri, PHP_URL_HOST);
568
-
569
-		if (substr($href, 0, 1) === "/") {
570
-			// URL is relative to site root
571
-			return "$scheme://$host$href";
572
-		}
573
-
574
-		// URL is relative to page
575
-		$path = parse_url($uri, PHP_URL_PATH);
576
-
577
-		return "$scheme://$host$path/$href";
578
-	}
15
+    /**
16
+     * @var ClientInterface
17
+     */
18
+    private $client;
19
+
20
+    /**
21
+     * @var array
22
+     */
23
+    private static $cache;
24
+
25
+    /**
26
+     * @var array
27
+     */
28
+    private $urls = [];
29
+
30
+    /**
31
+     * Constructor
32
+     * @param ClientInterface $client HTTP Client
33
+     */
34
+    public function __construct(ClientInterface $client) {
35
+        $this->client = $client;
36
+    }
37
+
38
+    /**
39
+     * Parses a URL into an array of metatags
40
+     *
41
+     * @param string $url URL to parse
42
+     * @return array
43
+     */
44
+    public function parse($url = '') {
45
+        $data = $this->getImageData($url);
46
+        if (!$data) {
47
+            $data = $this->getOEmbedData($url);
48
+        }
49
+        if (!$data) {
50
+            $data = $this->getDOMData($url);
51
+            if (is_array($data) && !empty($data['oembed_url'])) {
52
+                foreach ($data['oembed_url'] as $oembed_url) {
53
+                    $oembed_data = $this->getOEmbedData($oembed_url);
54
+                    if (!empty($oembed_data) && is_array($oembed_data)) {
55
+                        $oembed_data['oembed_url'] = $oembed_data['url'];
56
+                        unset($oembed_data['url']);
57
+                        $data = array_merge($data, $oembed_data);
58
+                    }
59
+                }
60
+            }
61
+        }
62
+
63
+        if (!is_array($data)) {
64
+            $data = array();
65
+        }
66
+
67
+        if (empty($data['thumbnail_url']) && !empty($data['thumbnails'])) {
68
+            $data['thumbnail_url'] = $data['thumbnails'][0];
69
+        }
70
+
71
+        return $data;
72
+    }
73
+
74
+    /**
75
+     * Parses image metatags
76
+     *
77
+     * @param string $url URL of the image
78
+     * @return array|false
79
+     */
80
+    public function getImageData($url = '') {
81
+        if (!$this->isImage($url)) {
82
+            return false;
83
+        }
84
+
85
+        return array(
86
+            'type' => 'photo',
87
+            'url' => $url,
88
+            'thumbnails' => array($url),
89
+        );
90
+    }
91
+
92
+    /**
93
+     * Parses OEmbed data
94
+     *
95
+     * @param  string $url URL of the image
96
+     * @return array|false
97
+     */
98
+    public function getOEmbedData($url = '') {
99
+
100
+        if (!$this->isJSON($url) && !$this->isXML($url)) {
101
+            return false;
102
+        }
103
+
104
+        $meta = array(
105
+            'url' => $url,
106
+        );
107
+
108
+        $content = $this->read($url);
109
+        if (!$content) {
110
+            return $meta;
111
+        }
112
+
113
+        $data = new \stdClass();
114
+        if ($this->isJSON($url)) {
115
+            $data = json_decode($content);
116
+        } else if ($this->isXML($url)) {
117
+            $data = simplexml_load_string($content);
118
+        }
119
+
120
+        $props = array(
121
+            'type',
122
+            'version',
123
+            'title',
124
+            'author_name',
125
+            'author_url',
126
+            'provider_name',
127
+            'provider_url',
128
+            'cache_age',
129
+            'thumbnail_url',
130
+            'thumbnail_width',
131
+            'thumbnail_height',
132
+            'width',
133
+            'height',
134
+            'html',
135
+        );
136
+        foreach ($props as $key) {
137
+            if (!empty($data->$key)) {
138
+                $meta[$key] = (string) $data->$key;
139
+            }
140
+        }
141
+        return $meta;
142
+    }
143
+
144
+    /**
145
+     * Parses metatags from DOM
146
+     *
147
+     * @param  string $url URL
148
+     * @return array|false
149
+     */
150
+    public function getDOMData($url = '') {
151
+
152
+        if (!$this->isHTML($url)) {
153
+            return false;
154
+        }
155
+
156
+        $doc = $this->getDOM($url);
157
+        if (!$doc) {
158
+            return false;
159
+        }
160
+
161
+        $defaults = array(
162
+            'url' => $url,
163
+        );
164
+
165
+        $link_tags = $this->parseLinkTags($doc);
166
+        $meta_tags = $this->parseMetaTags($doc);
167
+        $img_tags = $this->parseImgTags($doc);
168
+
169
+        $meta = array_merge_recursive($defaults, $link_tags, $meta_tags, $img_tags);
170
+
171
+        if (empty($meta['title'])) {
172
+            $meta['title'] = $this->parseTitle($doc);
173
+        }
174
+
175
+
176
+        return $meta;
177
+    }
178
+
179
+    /**
180
+     * Check if URL exists and is reachable by making an HTTP request to retrieve header information
181
+     *
182
+     * @param string $url URL of the resource
183
+     * @return boolean
184
+     */
185
+    public function exists($url = '') {
186
+        $response = $this->request($url);
187
+        if ($response instanceof Response) {
188
+            return $response->getStatusCode() == 200;
189
+        }
190
+        return false;
191
+    }
192
+
193
+    /**
194
+     * Validate URL
195
+     *
196
+     * @param string $url URL to validate
197
+     * @return bool
198
+     */
199
+    public function isValidUrl($url = '') {
200
+        // based on http://php.net/manual/en/function.filter-var.php#104160
201
+        // adapted by @mrclay in https://github.com/mrclay/Elgg-leaf/blob/62bf31c0ccdaab549a7e585a4412443e09821db3/engine/lib/output.php
202
+        $res = filter_var($url, FILTER_VALIDATE_URL);
203
+        if ($res) {
204
+            return $res;
205
+        }
206
+        // Check if it has unicode chars.
207
+        $l = mb_strlen($url);
208
+        if (strlen($url) == $l) {
209
+            return $res;
210
+        }
211
+        // Replace wide chars by “X”.
212
+        $s = '';
213
+        for ($i = 0; $i < $l; ++$i) {
214
+            $ch = elgg_substr($url, $i, 1);
215
+            $s .= (strlen($ch) > 1) ? 'X' : $ch;
216
+        }
217
+        // Re-check now.
218
+        return filter_var($s, FILTER_VALIDATE_URL) ? $url : false;
219
+    }
220
+
221
+    /**
222
+     * Returns head of the resource
223
+     *
224
+     * @param string $url URL of the resource
225
+     * @return Response|false
226
+     */
227
+    public function request($url = '') {
228
+        $url = str_replace(' ', '%20', $url);
229
+        if (!$this->isValidUrl($url)) {
230
+            return false;
231
+        }
232
+        if (!isset(self::$cache[$url])) {
233
+            try {
234
+                $response = $this->client->request('GET', $url);
235
+            } catch (Exception $e) {
236
+                $response = false;
237
+                error_log("Parser Error for HEAD request ($url): {$e->getMessage()}");
238
+            }
239
+            self::$cache[$url] = $response;
240
+        }
241
+
242
+        return self::$cache[$url];
243
+    }
244
+
245
+    /**
246
+     * Get contents of the page
247
+     *
248
+     * @param string $url URL of the resource
249
+     * @return string
250
+     */
251
+    public function read($url = '') {
252
+        $body = '';
253
+        if (!$this->exists($url)) {
254
+            return $body;
255
+        }
256
+
257
+        $response = $this->request($url);
258
+        $body = (string) $response->getBody();
259
+        return $body;
260
+    }
261
+
262
+    /**
263
+     * Checks if resource is an html page
264
+     *
265
+     * @param string $url URL of the resource
266
+     * @return boolean
267
+     */
268
+    public function isHTML($url = '') {
269
+        $mime = $this->getContentType($url);
270
+        return strpos($mime, 'text/html') !== false;
271
+    }
272
+
273
+    /**
274
+     * Checks if resource is JSON
275
+     *
276
+     * @param string $url URL of the resource
277
+     * @return boolean
278
+     */
279
+    public function isJSON($url = '') {
280
+        $mime = $this->getContentType($url);
281
+        return strpos($mime, 'json') !== false;
282
+    }
283
+
284
+    /**
285
+     * Checks if resource is XML
286
+     *
287
+     * @param string $url URL of the resource
288
+     * @return boolean
289
+     */
290
+    public function isXML($url = '') {
291
+        $mime = $this->getContentType($url);
292
+        return strpos($mime, 'xml') !== false;
293
+    }
294
+
295
+    /**
296
+     * Checks if resource is an image
297
+     *
298
+     * @param string $url URL of the resource
299
+     * @return boolean
300
+     */
301
+    public function isImage($url = '') {
302
+        $mime = $this->getContentType($url);
303
+        if ($mime) {
304
+            list($simple, ) = explode('/', $mime);
305
+            return ($simple == 'image');
306
+        }
307
+
308
+        return false;
309
+    }
310
+
311
+    /**
312
+     * Get mime type of the URL content
313
+     *
314
+     * @param string $url URL of the resource
315
+     * @return string
316
+     */
317
+    public function getContentType($url = '') {
318
+        $response = $this->request($url);
319
+        if ($response instanceof Response) {
320
+            $header = $response->getHeader('Content-Type');
321
+            if (is_array($header) && !empty($header)) {
322
+                $parts = explode(';', $header[0]);
323
+                return trim($parts[0]);
324
+            }
325
+        }
326
+        return '';
327
+    }
328
+
329
+    /**
330
+     * Returns HTML contents of the page
331
+     *
332
+     * @param string $url URL of the resource
333
+     * @return string
334
+     */
335
+    public function getHTML($url = '') {
336
+        if (!$this->isHTML($url)) {
337
+            return '';
338
+        }
339
+        return $this->read($url);
340
+    }
341
+
342
+    /**
343
+     * Returns HTML contents of the page as a DOMDocument
344
+     *
345
+     * @param string $url URL of the resource
346
+     * @return DOMDocument|false
347
+     */
348
+    public function getDOM($url = '') {
349
+        $html = $this->getHTML($url);
350
+        if (empty($html)) {
351
+            return false;
352
+        }
353
+        $doc = new DOMDocument();
354
+
355
+        libxml_use_internal_errors(true);
356
+
357
+        if (is_callable('mb_convert_encoding')) {
358
+            $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
359
+        } else {
360
+            $doc->loadHTML($html);
361
+        }
362
+        if (!$doc->documentURI) {
363
+            $doc->documentURI = $url;
364
+        }
365
+
366
+        libxml_clear_errors();
367
+
368
+        return $doc;
369
+    }
370
+
371
+    /**
372
+     * Parses document title
373
+     *
374
+     * @param DOMDocument $doc Document
375
+     * @return string
376
+     */
377
+    public function parseTitle(DOMDocument $doc) {
378
+        $node = $doc->getElementsByTagName('title');
379
+        $title = $node->item(0)->nodeValue;
380
+        return ($title) ?: '';
381
+    }
382
+
383
+    /**
384
+     * Parses <link> tags
385
+     *
386
+     * @param DOMDocument $doc Document
387
+     * @return array
388
+     */
389
+    public function parseLinkTags(DOMDocument $doc) {
390
+
391
+        $meta = array(
392
+            'icons' => [],
393
+            'thumbnails' => [],
394
+        );
395
+
396
+        $nodes = $doc->getElementsByTagName('link');
397
+        foreach ($nodes as $node) {
398
+            $rel = $node->getAttribute('rel');
399
+            $href = $node->getAttribute('href');
400
+
401
+            switch ($rel) {
402
+
403
+                case 'icon' :
404
+                    $image_url = $this->getAbsoluteURL($doc, $href);
405
+                    if ($this->isImage($image_url)) {
406
+                        $meta['icons'][] = $image_url;
407
+                    }
408
+                    break;
409
+
410
+                case 'canonical' :
411
+                    $meta['canonical'] = $this->getAbsoluteURL($doc, $href);
412
+                    break;
413
+
414
+                case 'alternate' :
415
+                    $type = $node->getAttribute('type');
416
+                    if (in_array($type, array(
417
+                        'application/json+oembed',
418
+                        'text/json+oembed',
419
+                        'application/xml+oembed',
420
+                        'text/xml+oembed'
421
+                    ))) {
422
+                        $meta['oembed_url'][] = $this->getAbsoluteURL($doc, $href);
423
+                    }
424
+                    break;
425
+            }
426
+        }
427
+
428
+        return $meta;
429
+    }
430
+
431
+    /**
432
+     * Parses <meta> tags
433
+     *
434
+     * @param DOMDocument $doc Document
435
+     * @return array
436
+     */
437
+    public function parseMetaTags(DOMDocument $doc) {
438
+
439
+        $meta = array();
440
+
441
+        $nodes = $doc->getElementsByTagName('meta');
442
+        if (!empty($nodes)) {
443
+            foreach ($nodes as $node) {
444
+                $name = $node->getAttribute('name');
445
+                if (!$name) {
446
+                    $name = $node->getAttribute('property');
447
+                }
448
+                if (!$name) {
449
+                    continue;
450
+                }
451
+
452
+                $name = strtolower($name);
453
+
454
+                if ($name == 'og:image:url' || $name == 'og:image:secure_url') {
455
+                    $name = 'og:image';
456
+                }
457
+
458
+                $content = $node->getAttribute('content');
459
+                if (isset($meta['metatags'][$name])) {
460
+                    if (!is_array($meta['metatags'][$name])) {
461
+                        $meta['metatags'][$name] = array($meta['metatags'][$name]);
462
+                    }
463
+                    $meta['metatags'][$name][] = $content;
464
+                } else {
465
+                    $meta['metatags'][$name] = $content;
466
+                }
467
+
468
+                switch ($name) {
469
+
470
+                    case 'title' :
471
+                    case 'og:title' :
472
+                    case 'twitter:title' :
473
+                        if (empty($meta['title'])) {
474
+                            $meta['title'] = $content;
475
+                        }
476
+                        break;
477
+
478
+                    case 'og:type' :
479
+                        if (empty($meta['type'])) {
480
+                            $meta['type'] = $content;
481
+                        }
482
+                        break;
483
+
484
+                    case 'description' :
485
+                    case 'og:description' :
486
+                    case 'twitter:description' :
487
+                        if (empty($meta['description'])) {
488
+                            $meta['description'] = $content;
489
+                        }
490
+                        break;
491
+
492
+                    case 'keywords' :
493
+                        if (is_string($content)) {
494
+                            $content = explode(',', $content);
495
+                            $content = array_map('trim', $content);
496
+                        }
497
+                        $meta['tags'] = $content;
498
+                        break;
499
+
500
+                    case 'og:site_name' :
501
+                    case 'twitter:site' :
502
+                        if (empty($meta['provider_name'])) {
503
+                            $meta['provider_name'] = $content;
504
+                        }
505
+                        break;
506
+
507
+                    case 'og:image' :
508
+                    case 'twitter:image' :
509
+                        $image_url = $this->getAbsoluteURL($doc, $content);
510
+                        if ($this->isImage($image_url)) {
511
+                            $meta['thumbnails'][] = $image_url;
512
+                        }
513
+                        break;
514
+                }
515
+            }
516
+        }
517
+
518
+        return $meta;
519
+    }
520
+
521
+    /**
522
+     * Parses <img> tags
523
+     *
524
+     * @param DOMDocument $doc Document
525
+     * @return array
526
+     */
527
+    public function parseImgTags(DOMDocument $doc) {
528
+
529
+        $meta = array(
530
+            'thumbnails' => [],
531
+        );
532
+
533
+        $nodes = $doc->getElementsByTagName('img');
534
+        foreach ($nodes as $node) {
535
+            $src = $node->getAttribute('src');
536
+            $image_url = $this->getAbsoluteURL($doc, $src);
537
+            if ($this->isImage($image_url)) {
538
+                $meta['thumbnails'][] = $image_url;
539
+            }
540
+        }
541
+
542
+        return $meta;
543
+    }
544
+
545
+    /**
546
+     * Normalizes relative URLs
547
+     *
548
+     * @param DOMDocument $doc  Document
549
+     * @param string      $href URL to normalize
550
+     * @return string|false
551
+     */
552
+    public function getAbsoluteURL(DOMDocument $doc, $href = '') {
553
+
554
+        if (preg_match("/^data:/i", $href)) {
555
+            // data URIs can not be resolved
556
+            return false;
557
+        }
558
+
559
+        // Check if $href is absolute
560
+        if (parse_url($href, PHP_URL_HOST)) {
561
+            return $href;
562
+        }
563
+
564
+        $uri = trim($doc->documentURI ?: '', '/');
565
+
566
+        $scheme = parse_url($uri, PHP_URL_SCHEME);
567
+        $host = parse_url($uri, PHP_URL_HOST);
568
+
569
+        if (substr($href, 0, 1) === "/") {
570
+            // URL is relative to site root
571
+            return "$scheme://$host$href";
572
+        }
573
+
574
+        // URL is relative to page
575
+        $path = parse_url($uri, PHP_URL_PATH);
576
+
577
+        return "$scheme://$host$path/$href";
578
+    }
579 579
 
580 580
 }
581 581
\ No newline at end of file
Please login to merge, or discard this patch.