Test Failed
Push — master ( 56550f...b49bd0 )
by Ismayil
03:13
created
src/hypeJunction/Parser.php 3 patches
Doc Comments   +1 added line, -1 removed line patch added patch discarded remove patch
@@ -339,7 +339,7 @@
 block discarded – undo
339 339
 	 * Returns HTML contents of the page as a DOMDocument
340 340
 	 *
341 341
 	 * @param string $url URL of the resource
342
-	 * @return DOMDocument|false
342
+	 * @return DOMDocument
343 343
 	 */
344 344
 	public function getDOM($url = '') {
345 345
 		$html = $this->getHTML($url);
Please login to merge, or discard this patch.
Indentation   +543 added lines, -543 removed lines patch added patch discarded remove patch
@@ -12,548 +12,548 @@
 block discarded – undo
12 12
  */
13 13
 class Parser {
14 14
 
15
-	/**
16
-	 * @var ClientInterface
17
-	 */
18
-	private $client;
19
-
20
-	/**
21
-	 * @var array
22
-	 */
23
-	private static $cache;
24
-
25
-	/**
26
-	 * Constructor
27
-	 * @param ClientInterface $client HTTP Client
28
-	 */
29
-	public function __construct(ClientInterface $client) {
30
-		$this->client = $client;
31
-	}
32
-
33
-	/**
34
-	 * Parses a URL into a an array of metatags
35
-	 *
36
-	 * @param string $url URL to parse
37
-	 * @return array
38
-	 */
39
-	public function parse($url = '') {
40
-
41
-		$data = $this->getImageData($url);
42
-		if (!$data) {
43
-			$data = $this->getOEmbedData($url);
44
-		}
45
-		if (!$data) {
46
-			$data = $this->getDOMData($url);
47
-			if (is_array($data) && !empty($data['oembed_url'])) {
48
-				foreach ($data['oembed_url'] as $oembed_url) {
49
-					$oembed_data = $this->parse($oembed_url);
50
-					if (!empty($oembed_data) && is_array($oembed_data)) {
51
-						$oembed_data['oembed_url'] = $oembed_data['url'];
52
-						unset($oembed_data['url']);
53
-						$data = array_merge($data, $oembed_data);
54
-					}
55
-				}
56
-			}
57
-		}
58
-
59
-		if (!is_array($data)) {
60
-			$data = array();
61
-		}
62
-
63
-		if (empty($data['thumbnail_url']) && !empty($data['thumbnails'])) {
64
-			$data['thumbnail_url'] = $data['thumbnails'][0];
65
-		}
66
-
67
-		return $data;
68
-	}
69
-
70
-	/**
71
-	 * Parses image metatags
72
-	 *
73
-	 * @param string $url URL of the image
74
-	 * @return array|false
75
-	 */
76
-	public function getImageData($url = '') {
77
-		if (!$this->isImage($url)) {
78
-			return false;
79
-		}
80
-
81
-		return array(
82
-			'type' => 'photo',
83
-			'url' => $url,
84
-			'thumbnails' => array($url),
85
-		);
86
-	}
87
-
88
-	/**
89
-	 * Parses OEmbed data
90
-	 *
91
-	 * @param  string $url URL of the image
92
-	 * @return array|false
93
-	 */
94
-	public function getOEmbedData($url = '') {
95
-
96
-		if (!$this->isJSON($url) && !$this->isXML($url)) {
97
-			return false;
98
-		}
99
-
100
-		$meta = array(
101
-			'url' => $url,
102
-		);
103
-
104
-		$content = $this->read($url);
105
-		if (!$content) {
106
-			return $meta;
107
-		}
108
-
109
-		$data = new \stdClass();
110
-		if ($this->isJSON($url)) {
111
-			$data = json_decode($content);
112
-		} else if ($this->isXML($url)) {
113
-			$data = simplexml_load_string($content);
114
-		}
115
-
116
-		$props = array(
117
-			'type',
118
-			'version',
119
-			'title',
120
-			'author_name',
121
-			'author_url',
122
-			'provider_name',
123
-			'provider_url',
124
-			'cache_age',
125
-			'thumbnail_url',
126
-			'thumbnail_width',
127
-			'thumbnail_height',
128
-			'width',
129
-			'height',
130
-			'html',
131
-		);
132
-		foreach ($props as $key) {
133
-			if (!empty($data->$key)) {
134
-				$meta[$key] = (string) $data->$key;
135
-			}
136
-		}
137
-		return $meta;
138
-	}
139
-
140
-	/**
141
-	 * Parses metatags from DOM
142
-	 *
143
-	 * @param  string $url URL
144
-	 * @return array|false
145
-	 */
146
-	public function getDOMData($url = '') {
147
-
148
-		if (!$this->isHTML($url)) {
149
-			return false;
150
-		}
151
-
152
-		$doc = $this->getDOM($url);
153
-		if (!$doc) {
154
-			return false;
155
-		}
156
-
157
-		$defaults = array(
158
-			'url' => $url,
159
-		);
160
-
161
-		$link_tags = $this->parseLinkTags($doc);
162
-		$meta_tags = $this->parseMetaTags($doc);
163
-		$img_tags = $this->parseImgTags($doc);
164
-
165
-		$meta = array_merge_recursive($defaults, $link_tags, $meta_tags, $img_tags);
166
-
167
-		if (empty($meta['title'])) {
168
-			$meta['title'] = $this->parseTitle($doc);
169
-		}
170
-
171
-
172
-		return $meta;
173
-	}
174
-
175
-	/**
176
-	 * Check if URL exists and is reachable by making an HTTP request to retrieve header information
177
-	 *
178
-	 * @param string $url URL of the resource
179
-	 * @return boolean
180
-	 */
181
-	public function exists($url = '') {
182
-		$response = $this->request($url);
183
-		if ($response instanceof Response) {
184
-			return $response->getStatusCode() == 200;
185
-		}
186
-		return false;
187
-	}
188
-
189
-	/**
190
-	 * Validate URL
191
-	 * 
192
-	 * @param string $url URL to validate
193
-	 * @return bool
194
-	 */
195
-	public function isValidUrl($url = '') {
196
-		// based on http://php.net/manual/en/function.filter-var.php#104160
197
-		// adapted by @mrclay in https://github.com/mrclay/Elgg-leaf/blob/62bf31c0ccdaab549a7e585a4412443e09821db3/engine/lib/output.php
198
-		$res = filter_var($url, FILTER_VALIDATE_URL);
199
-		if ($res) {
200
-			return $res;
201
-		}
202
-		// Check if it has unicode chars.
203
-		$l = elgg_strlen($url);
204
-		if (strlen($url) == $l) {
205
-			return $res;
206
-		}
207
-		// Replace wide chars by “X”.
208
-		$s = '';
209
-		for ($i = 0; $i < $l; ++$i) {
210
-			$ch = elgg_substr($url, $i, 1);
211
-			$s .= (strlen($ch) > 1) ? 'X' : $ch;
212
-		}
213
-		// Re-check now.
214
-		return filter_var($s, FILTER_VALIDATE_URL) ? $url : false;
215
-	}
216
-
217
-	/**
218
-	 * Returns head of the resource
219
-	 *
220
-	 * @param string $url URL of the resource
221
-	 * @return Response|false
222
-	 */
223
-	public function request($url = '') {
224
-		$url = str_replace(' ', '%20', $url);
225
-		if (!$this->isValidUrl($url)) {
226
-			return false;
227
-		}
228
-		if (!isset(self::$cache[$url])) {
229
-			try {
230
-				$response = $this->client->request('GET', $url);
231
-			} catch (Exception $e) {
232
-				$response = false;
233
-				error_log("Parser Error for HEAD request ($url): {$e->getMessage()}");
234
-			}
235
-			self::$cache[$url] = $response;
236
-		}
237
-
238
-		return self::$cache[$url];
239
-	}
240
-
241
-	/**
242
-	 * Get contents of the page
243
-	 *
244
-	 * @param string $url URL of the resource
245
-	 * @return string
246
-	 */
247
-	public function read($url = '') {
248
-		$body = '';
249
-		if (!$this->exists($url)) {
250
-			return $body;
251
-		}
252
-
253
-		$response = $this->request($url);
254
-		$body = (string) $response->getBody();
255
-		return $body;
256
-	}
257
-
258
-	/**
259
-	 * Checks if resource is an html page
260
-	 *
261
-	 * @param string $url URL of the resource
262
-	 * @return boolean
263
-	 */
264
-	public function isHTML($url = '') {
265
-		$mime = $this->getContentType($url);
266
-		return strpos($mime, 'text/html') !== false;
267
-	}
268
-
269
-	/**
270
-	 * Checks if resource is JSON
271
-	 *
272
-	 * @param string $url URL of the resource
273
-	 * @return boolean
274
-	 */
275
-	public function isJSON($url = '') {
276
-		$mime = $this->getContentType($url);
277
-		return strpos($mime, 'json') !== false;
278
-	}
279
-
280
-	/**
281
-	 * Checks if resource is XML
282
-	 *
283
-	 * @param string $url URL of the resource
284
-	 * @return boolean
285
-	 */
286
-	public function isXML($url = '') {
287
-		$mime = $this->getContentType($url);
288
-		return strpos($mime, 'xml') !== false;
289
-	}
290
-
291
-	/**
292
-	 * Checks if resource is an image
293
-	 *
294
-	 * @param string $url URL of the resource
295
-	 * @return boolean
296
-	 */
297
-	public function isImage($url = '') {
298
-		$mime = $this->getContentType($url);
299
-		if ($mime) {
300
-			list($simple, ) = explode('/', $mime);
301
-			return ($simple == 'image');
302
-		}
303
-
304
-		return false;
305
-	}
306
-
307
-	/**
308
-	 * Get mime type of the URL content
309
-	 *
310
-	 * @param string $url URL of the resource
311
-	 * @return string
312
-	 */
313
-	public function getContentType($url = '') {
314
-		$response = $this->request($url);
315
-		if ($response instanceof Response) {
316
-			$header = $response->getHeader('Content-Type');
317
-			if (is_array($header) && !empty($header)) {
318
-				$parts = explode(';', $header[0]);
319
-				return trim($parts[0]);
320
-			}
321
-		}
322
-		return '';
323
-	}
324
-
325
-	/**
326
-	 * Returns HTML contents of the page
327
-	 *
328
-	 * @param string $url URL of the resource
329
-	 * @return string
330
-	 */
331
-	public function getHTML($url = '') {
332
-		if (!$this->isHTML($url)) {
333
-			return '';
334
-		}
335
-		return $this->read($url);
336
-	}
337
-
338
-	/**
339
-	 * Returns HTML contents of the page as a DOMDocument
340
-	 *
341
-	 * @param string $url URL of the resource
342
-	 * @return DOMDocument|false
343
-	 */
344
-	public function getDOM($url = '') {
345
-		$html = $this->getHTML($url);
346
-		if (empty($html)) {
347
-			return false;
348
-		}
349
-		$doc = new DOMDocument();
350
-		if (is_callable('mb_convert_encoding')) {
351
-			$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
352
-		} else {
353
-			$doc->loadHTML($html);
354
-		}
355
-		if (!$doc->documentURI) {
356
-			$doc->documentURI = $url;
357
-		}
358
-		return $doc;
359
-	}
360
-
361
-	/**
362
-	 * Parses document title
363
-	 *
364
-	 * @param DOMDocument $doc Document
365
-	 * @return string
366
-	 */
367
-	public function parseTitle(DOMDocument $doc) {
368
-		$node = $doc->getElementsByTagName('title');
369
-		$title = $node->item(0)->nodeValue;
370
-		return ($title) ?: '';
371
-	}
372
-
373
-	/**
374
-	 * Parses <link> tags
375
-	 *
376
-	 * @param DOMDocument $doc Document
377
-	 * @return array
378
-	 */
379
-	public function parseLinkTags(DOMDocument $doc) {
380
-
381
-		$meta = array();
382
-
383
-		$nodes = $doc->getElementsByTagName('link');
384
-		foreach ($nodes as $node) {
385
-			$rel = $node->getAttribute('rel');
386
-			$href = $node->getAttribute('href');
387
-
388
-			switch ($rel) {
389
-
390
-				case 'icon' :
391
-					$image_url = $this->getAbsoluteURL($doc, $href);
392
-					if ($this->isImage($image_url)) {
393
-						$meta['icons'][] = $image_url;
394
-					}
395
-					break;
396
-
397
-				case 'canonical' :
398
-					$meta['canonical'] = $this->getAbsoluteURL($doc, $href);
399
-					break;
400
-
401
-				case 'alternate' :
402
-					$type = $node->getAttribute('type');
403
-					if (in_array($type, array(
404
-								'application/json+oembed',
405
-								'text/json+oembed',
406
-								'application/xml+oembed',
407
-								'text/xml+oembed'
408
-							))) {
409
-						$meta['oembed_url'][] = $this->getAbsoluteURL($doc, $href);
410
-					}
411
-					break;
412
-			}
413
-		}
414
-
415
-		return $meta;
416
-	}
417
-
418
-	/**
419
-	 * Parses <meta> tags
420
-	 *
421
-	 * @param DOMDocument $doc Document
422
-	 * @return array
423
-	 */
424
-	public function parseMetaTags(DOMDocument $doc) {
425
-
426
-		$meta = array();
427
-
428
-		$nodes = $doc->getElementsByTagName('meta');
429
-		if (!empty($nodes)) {
430
-			foreach ($nodes as $node) {
431
-				$name = $node->getAttribute('name');
432
-				if (!$name) {
433
-					$name = $node->getAttribute('property');
434
-				}
435
-				if (!$name) {
436
-					continue;
437
-				}
438
-
439
-				$name = strtolower($name);
440
-
441
-				$content = $node->getAttribute('content');
442
-				if (isset($meta['metatags'][$name])) {
443
-					if (!is_array($meta['metatags'][$name])) {
444
-						$meta['metatags'][$name] = array($meta['metatags'][$name]);
445
-					}
446
-					$meta['metatags'][$name][] = $content;
447
-				} else {
448
-					$meta['metatags'][$name] = $content;
449
-				}
450
-
451
-				switch ($name) {
452
-
453
-					case 'title' :
454
-					case 'og:title' :
455
-					case 'twitter:title' :
456
-						if (empty($meta['title'])) {
457
-							$meta['title'] = $content;
458
-						}
459
-						break;
460
-
461
-					case 'og:type' :
462
-						if (empty($meta['type'])) {
463
-							$meta['type'] = $content;
464
-						}
465
-						break;
466
-
467
-					case 'description' :
468
-					case 'og:description' :
469
-					case 'twitter:description' :
470
-						if (empty($meta['description'])) {
471
-							$meta['description'] = $content;
472
-						}
473
-						break;
474
-
475
-					case 'keywords' :
476
-						if (is_string($content)) {
477
-							$content = explode(',', $content);
478
-							$content = array_map('trim', $content);
479
-						}
480
-						$meta['tags'] = $content;
481
-						break;
482
-
483
-					case 'og:site_name' :
484
-					case 'twitter:site' :
485
-						if (empty($meta['provider_name'])) {
486
-							$meta['provider_name'] = $content;
487
-						}
488
-						break;
489
-
490
-					case 'og:image' :
491
-					case 'twitter:image' :
492
-						$image_url = $this->getAbsoluteURL($doc, $content);
493
-						if ($this->isImage($image_url)) {
494
-							$meta['thumbnails'][] = $image_url;
495
-						}
496
-						break;
497
-				}
498
-			}
499
-		}
500
-
501
-		return $meta;
502
-	}
503
-
504
-	/**
505
-	 * Parses <img> tags
506
-	 *
507
-	 * @param DOMDocument $doc Document
508
-	 * @return array
509
-	 */
510
-	public function parseImgTags(DOMDocument $doc) {
511
-
512
-		$meta = array();
513
-
514
-		$nodes = $doc->getElementsByTagName('img');
515
-		foreach ($nodes as $node) {
516
-			$src = $node->getAttribute('src');
517
-			$image_url = $this->getAbsoluteURL($doc, $src);
518
-			if ($this->isImage($image_url)) {
519
-				$meta['thumbnails'][] = $image_url;
520
-			}
521
-		}
522
-
523
-		return $meta;
524
-	}
525
-
526
-	/**
527
-	 * Normalizes relative URLs
528
-	 *
529
-	 * @param DOMDocument $doc  Document
530
-	 * @param string      $href URL to normalize
531
-	 * @return string|false
532
-	 */
533
-	public function getAbsoluteURL(DOMDocument $doc, $href = '') {
534
-
535
-		if (preg_match("/^data:/i", $href)) {
536
-			// data URIs can not be resolved
537
-			return false;
538
-		}
539
-
540
-		// Check if $url is absolute
541
-		if (parse_url($href, PHP_URL_HOST)) {
542
-			return $href;
543
-		}
544
-
545
-		$uri = trim($doc->documentURI ?: '', '/');
546
-
547
-		// Check if $url is relative to root
548
-		if (substr($href, 0, 1) === "/") {
549
-			$scheme = parse_url($uri, PHP_URL_SCHEME);
550
-			$host = parse_url($uri, PHP_URL_HOST);
551
-			return "$scheme://$host$href";
552
-		}
553
-
554
-		// $url is relative to page
555
-		$uri = pathinfo($uri, PATHINFO_DIRNAME);
556
-		return "$uri/$href";
557
-	}
15
+    /**
16
+     * @var ClientInterface
17
+     */
18
+    private $client;
19
+
20
+    /**
21
+     * @var array
22
+     */
23
+    private static $cache;
24
+
25
+    /**
26
+     * Constructor
27
+     * @param ClientInterface $client HTTP Client
28
+     */
29
+    public function __construct(ClientInterface $client) {
30
+        $this->client = $client;
31
+    }
32
+
33
+    /**
34
+     * Parses a URL into an array of metatags
35
+     *
36
+     * @param string $url URL to parse
37
+     * @return array
38
+     */
39
+    public function parse($url = '') {
40
+
41
+        $data = $this->getImageData($url);
42
+        if (!$data) {
43
+            $data = $this->getOEmbedData($url);
44
+        }
45
+        if (!$data) {
46
+            $data = $this->getDOMData($url);
47
+            if (is_array($data) && !empty($data['oembed_url'])) {
48
+                foreach ($data['oembed_url'] as $oembed_url) {
49
+                    $oembed_data = $this->parse($oembed_url);
50
+                    if (!empty($oembed_data) && is_array($oembed_data)) {
51
+                        $oembed_data['oembed_url'] = $oembed_data['url'];
52
+                        unset($oembed_data['url']);
53
+                        $data = array_merge($data, $oembed_data);
54
+                    }
55
+                }
56
+            }
57
+        }
58
+
59
+        if (!is_array($data)) {
60
+            $data = array();
61
+        }
62
+
63
+        if (empty($data['thumbnail_url']) && !empty($data['thumbnails'])) {
64
+            $data['thumbnail_url'] = $data['thumbnails'][0];
65
+        }
66
+
67
+        return $data;
68
+    }
69
+
70
+    /**
71
+     * Parses image metatags
72
+     *
73
+     * @param string $url URL of the image
74
+     * @return array|false
75
+     */
76
+    public function getImageData($url = '') {
77
+        if (!$this->isImage($url)) {
78
+            return false;
79
+        }
80
+
81
+        return array(
82
+            'type' => 'photo',
83
+            'url' => $url,
84
+            'thumbnails' => array($url),
85
+        );
86
+    }
87
+
88
+    /**
89
+     * Parses OEmbed data
90
+     *
91
+     * @param  string $url URL of the image
92
+     * @return array|false
93
+     */
94
+    public function getOEmbedData($url = '') {
95
+
96
+        if (!$this->isJSON($url) && !$this->isXML($url)) {
97
+            return false;
98
+        }
99
+
100
+        $meta = array(
101
+            'url' => $url,
102
+        );
103
+
104
+        $content = $this->read($url);
105
+        if (!$content) {
106
+            return $meta;
107
+        }
108
+
109
+        $data = new \stdClass();
110
+        if ($this->isJSON($url)) {
111
+            $data = json_decode($content);
112
+        } else if ($this->isXML($url)) {
113
+            $data = simplexml_load_string($content);
114
+        }
115
+
116
+        $props = array(
117
+            'type',
118
+            'version',
119
+            'title',
120
+            'author_name',
121
+            'author_url',
122
+            'provider_name',
123
+            'provider_url',
124
+            'cache_age',
125
+            'thumbnail_url',
126
+            'thumbnail_width',
127
+            'thumbnail_height',
128
+            'width',
129
+            'height',
130
+            'html',
131
+        );
132
+        foreach ($props as $key) {
133
+            if (!empty($data->$key)) {
134
+                $meta[$key] = (string) $data->$key;
135
+            }
136
+        }
137
+        return $meta;
138
+    }
139
+
140
+    /**
141
+     * Parses metatags from DOM
142
+     *
143
+     * @param  string $url URL
144
+     * @return array|false
145
+     */
146
+    public function getDOMData($url = '') {
147
+
148
+        if (!$this->isHTML($url)) {
149
+            return false;
150
+        }
151
+
152
+        $doc = $this->getDOM($url);
153
+        if (!$doc) {
154
+            return false;
155
+        }
156
+
157
+        $defaults = array(
158
+            'url' => $url,
159
+        );
160
+
161
+        $link_tags = $this->parseLinkTags($doc);
162
+        $meta_tags = $this->parseMetaTags($doc);
163
+        $img_tags = $this->parseImgTags($doc);
164
+
165
+        $meta = array_merge_recursive($defaults, $link_tags, $meta_tags, $img_tags);
166
+
167
+        if (empty($meta['title'])) {
168
+            $meta['title'] = $this->parseTitle($doc);
169
+        }
170
+
171
+
172
+        return $meta;
173
+    }
174
+
175
+    /**
176
+     * Check if URL exists and is reachable by making an HTTP request to retrieve header information
177
+     *
178
+     * @param string $url URL of the resource
179
+     * @return boolean
180
+     */
181
+    public function exists($url = '') {
182
+        $response = $this->request($url);
183
+        if ($response instanceof Response) {
184
+            return $response->getStatusCode() == 200;
185
+        }
186
+        return false;
187
+    }
188
+
189
+    /**
190
+     * Validate URL
191
+     * 
192
+     * @param string $url URL to validate
193
+     * @return bool
194
+     */
195
+    public function isValidUrl($url = '') {
196
+        // based on http://php.net/manual/en/function.filter-var.php#104160
197
+        // adapted by @mrclay in https://github.com/mrclay/Elgg-leaf/blob/62bf31c0ccdaab549a7e585a4412443e09821db3/engine/lib/output.php
198
+        $res = filter_var($url, FILTER_VALIDATE_URL);
199
+        if ($res) {
200
+            return $res;
201
+        }
202
+        // Check if it has unicode chars.
203
+        $l = elgg_strlen($url);
204
+        if (strlen($url) == $l) {
205
+            return $res;
206
+        }
207
+        // Replace wide chars by “X”.
208
+        $s = '';
209
+        for ($i = 0; $i < $l; ++$i) {
210
+            $ch = elgg_substr($url, $i, 1);
211
+            $s .= (strlen($ch) > 1) ? 'X' : $ch;
212
+        }
213
+        // Re-check now.
214
+        return filter_var($s, FILTER_VALIDATE_URL) ? $url : false;
215
+    }
216
+
217
+    /**
218
+     * Returns head of the resource
219
+     *
220
+     * @param string $url URL of the resource
221
+     * @return Response|false
222
+     */
223
+    public function request($url = '') {
224
+        $url = str_replace(' ', '%20', $url);
225
+        if (!$this->isValidUrl($url)) {
226
+            return false;
227
+        }
228
+        if (!isset(self::$cache[$url])) {
229
+            try {
230
+                $response = $this->client->request('GET', $url);
231
+            } catch (Exception $e) {
232
+                $response = false;
233
+                error_log("Parser Error for HEAD request ($url): {$e->getMessage()}");
234
+            }
235
+            self::$cache[$url] = $response;
236
+        }
237
+
238
+        return self::$cache[$url];
239
+    }
240
+
241
+    /**
242
+     * Get contents of the page
243
+     *
244
+     * @param string $url URL of the resource
245
+     * @return string
246
+     */
247
+    public function read($url = '') {
248
+        $body = '';
249
+        if (!$this->exists($url)) {
250
+            return $body;
251
+        }
252
+
253
+        $response = $this->request($url);
254
+        $body = (string) $response->getBody();
255
+        return $body;
256
+    }
257
+
258
+    /**
259
+     * Checks if resource is an html page
260
+     *
261
+     * @param string $url URL of the resource
262
+     * @return boolean
263
+     */
264
+    public function isHTML($url = '') {
265
+        $mime = $this->getContentType($url);
266
+        return strpos($mime, 'text/html') !== false;
267
+    }
268
+
269
+    /**
270
+     * Checks if resource is JSON
271
+     *
272
+     * @param string $url URL of the resource
273
+     * @return boolean
274
+     */
275
+    public function isJSON($url = '') {
276
+        $mime = $this->getContentType($url);
277
+        return strpos($mime, 'json') !== false;
278
+    }
279
+
280
+    /**
281
+     * Checks if resource is XML
282
+     *
283
+     * @param string $url URL of the resource
284
+     * @return boolean
285
+     */
286
+    public function isXML($url = '') {
287
+        $mime = $this->getContentType($url);
288
+        return strpos($mime, 'xml') !== false;
289
+    }
290
+
291
+    /**
292
+     * Checks if resource is an image
293
+     *
294
+     * @param string $url URL of the resource
295
+     * @return boolean
296
+     */
297
+    public function isImage($url = '') {
298
+        $mime = $this->getContentType($url);
299
+        if ($mime) {
300
+            list($simple, ) = explode('/', $mime);
301
+            return ($simple == 'image');
302
+        }
303
+
304
+        return false;
305
+    }
306
+
307
+    /**
308
+     * Get mime type of the URL content
309
+     *
310
+     * @param string $url URL of the resource
311
+     * @return string
312
+     */
313
+    public function getContentType($url = '') {
314
+        $response = $this->request($url);
315
+        if ($response instanceof Response) {
316
+            $header = $response->getHeader('Content-Type');
317
+            if (is_array($header) && !empty($header)) {
318
+                $parts = explode(';', $header[0]);
319
+                return trim($parts[0]);
320
+            }
321
+        }
322
+        return '';
323
+    }
324
+
325
+    /**
326
+     * Returns HTML contents of the page
327
+     *
328
+     * @param string $url URL of the resource
329
+     * @return string
330
+     */
331
+    public function getHTML($url = '') {
332
+        if (!$this->isHTML($url)) {
333
+            return '';
334
+        }
335
+        return $this->read($url);
336
+    }
337
+
338
+    /**
339
+     * Returns HTML contents of the page as a DOMDocument
340
+     *
341
+     * @param string $url URL of the resource
342
+     * @return DOMDocument|false
343
+     */
344
+    public function getDOM($url = '') {
345
+        $html = $this->getHTML($url);
346
+        if (empty($html)) {
347
+            return false;
348
+        }
349
+        $doc = new DOMDocument();
350
+        if (is_callable('mb_convert_encoding')) {
351
+            $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
352
+        } else {
353
+            $doc->loadHTML($html);
354
+        }
355
+        if (!$doc->documentURI) {
356
+            $doc->documentURI = $url;
357
+        }
358
+        return $doc;
359
+    }
360
+
361
+    /**
362
+     * Parses document title
363
+     *
364
+     * @param DOMDocument $doc Document
365
+     * @return string
366
+     */
367
+    public function parseTitle(DOMDocument $doc) {
368
+        $node = $doc->getElementsByTagName('title');
369
+        $title = $node->item(0)->nodeValue;
370
+        return ($title) ?: '';
371
+    }
372
+
373
+    /**
374
+     * Parses <link> tags
375
+     *
376
+     * @param DOMDocument $doc Document
377
+     * @return array
378
+     */
379
+    public function parseLinkTags(DOMDocument $doc) {
380
+
381
+        $meta = array();
382
+
383
+        $nodes = $doc->getElementsByTagName('link');
384
+        foreach ($nodes as $node) {
385
+            $rel = $node->getAttribute('rel');
386
+            $href = $node->getAttribute('href');
387
+
388
+            switch ($rel) {
389
+
390
+                case 'icon' :
391
+                    $image_url = $this->getAbsoluteURL($doc, $href);
392
+                    if ($this->isImage($image_url)) {
393
+                        $meta['icons'][] = $image_url;
394
+                    }
395
+                    break;
396
+
397
+                case 'canonical' :
398
+                    $meta['canonical'] = $this->getAbsoluteURL($doc, $href);
399
+                    break;
400
+
401
+                case 'alternate' :
402
+                    $type = $node->getAttribute('type');
403
+                    if (in_array($type, array(
404
+                                'application/json+oembed',
405
+                                'text/json+oembed',
406
+                                'application/xml+oembed',
407
+                                'text/xml+oembed'
408
+                            ))) {
409
+                        $meta['oembed_url'][] = $this->getAbsoluteURL($doc, $href);
410
+                    }
411
+                    break;
412
+            }
413
+        }
414
+
415
+        return $meta;
416
+    }
417
+
418
+    /**
419
+     * Parses <meta> tags
420
+     *
421
+     * @param DOMDocument $doc Document
422
+     * @return array
423
+     */
424
+    public function parseMetaTags(DOMDocument $doc) {
425
+
426
+        $meta = array();
427
+
428
+        $nodes = $doc->getElementsByTagName('meta');
429
+        if (!empty($nodes)) {
430
+            foreach ($nodes as $node) {
431
+                $name = $node->getAttribute('name');
432
+                if (!$name) {
433
+                    $name = $node->getAttribute('property');
434
+                }
435
+                if (!$name) {
436
+                    continue;
437
+                }
438
+
439
+                $name = strtolower($name);
440
+
441
+                $content = $node->getAttribute('content');
442
+                if (isset($meta['metatags'][$name])) {
443
+                    if (!is_array($meta['metatags'][$name])) {
444
+                        $meta['metatags'][$name] = array($meta['metatags'][$name]);
445
+                    }
446
+                    $meta['metatags'][$name][] = $content;
447
+                } else {
448
+                    $meta['metatags'][$name] = $content;
449
+                }
450
+
451
+                switch ($name) {
452
+
453
+                    case 'title' :
454
+                    case 'og:title' :
455
+                    case 'twitter:title' :
456
+                        if (empty($meta['title'])) {
457
+                            $meta['title'] = $content;
458
+                        }
459
+                        break;
460
+
461
+                    case 'og:type' :
462
+                        if (empty($meta['type'])) {
463
+                            $meta['type'] = $content;
464
+                        }
465
+                        break;
466
+
467
+                    case 'description' :
468
+                    case 'og:description' :
469
+                    case 'twitter:description' :
470
+                        if (empty($meta['description'])) {
471
+                            $meta['description'] = $content;
472
+                        }
473
+                        break;
474
+
475
+                    case 'keywords' :
476
+                        if (is_string($content)) {
477
+                            $content = explode(',', $content);
478
+                            $content = array_map('trim', $content);
479
+                        }
480
+                        $meta['tags'] = $content;
481
+                        break;
482
+
483
+                    case 'og:site_name' :
484
+                    case 'twitter:site' :
485
+                        if (empty($meta['provider_name'])) {
486
+                            $meta['provider_name'] = $content;
487
+                        }
488
+                        break;
489
+
490
+                    case 'og:image' :
491
+                    case 'twitter:image' :
492
+                        $image_url = $this->getAbsoluteURL($doc, $content);
493
+                        if ($this->isImage($image_url)) {
494
+                            $meta['thumbnails'][] = $image_url;
495
+                        }
496
+                        break;
497
+                }
498
+            }
499
+        }
500
+
501
+        return $meta;
502
+    }
503
+
504
+    /**
505
+     * Parses <img> tags
506
+     *
507
+     * @param DOMDocument $doc Document
508
+     * @return array
509
+     */
510
+    public function parseImgTags(DOMDocument $doc) {
511
+
512
+        $meta = array();
513
+
514
+        $nodes = $doc->getElementsByTagName('img');
515
+        foreach ($nodes as $node) {
516
+            $src = $node->getAttribute('src');
517
+            $image_url = $this->getAbsoluteURL($doc, $src);
518
+            if ($this->isImage($image_url)) {
519
+                $meta['thumbnails'][] = $image_url;
520
+            }
521
+        }
522
+
523
+        return $meta;
524
+    }
525
+
526
+    /**
527
+     * Normalizes relative URLs
528
+     *
529
+     * @param DOMDocument $doc  Document
530
+     * @param string      $href URL to normalize
531
+     * @return string|false
532
+     */
533
+    public function getAbsoluteURL(DOMDocument $doc, $href = '') {
534
+
535
+        if (preg_match("/^data:/i", $href)) {
536
+            // data URIs can not be resolved
537
+            return false;
538
+        }
539
+
540
+        // Check if $url is absolute
541
+        if (parse_url($href, PHP_URL_HOST)) {
542
+            return $href;
543
+        }
544
+
545
+        $uri = trim($doc->documentURI ?: '', '/');
546
+
547
+        // Check if $url is relative to root
548
+        if (substr($href, 0, 1) === "/") {
549
+            $scheme = parse_url($uri, PHP_URL_SCHEME);
550
+            $host = parse_url($uri, PHP_URL_HOST);
551
+            return "$scheme://$host$href";
552
+        }
553
+
554
+        // $url is relative to page
555
+        $uri = pathinfo($uri, PATHINFO_DIRNAME);
556
+        return "$uri/$href";
557
+    }
558 558
 
559 559
 }
Please login to merge, or discard this patch.
Spacing   +1 added line, -1 removed line patch added patch discarded remove patch
@@ -297,7 +297,7 @@
 block discarded – undo
297 297
 	public function isImage($url = '') {
298 298
 		$mime = $this->getContentType($url);
299 299
 		if ($mime) {
300
-			list($simple, ) = explode('/', $mime);
300
+			list($simple,) = explode('/', $mime);
301 301
 			return ($simple == 'image');
302 302
 		}
303 303
 
Please login to merge, or discard this patch.