Passed
Push — master ( 2ef246...0a1acd )
by Ismayil
03:00
created
src/hypeJunction/Parser.php 1 patch
Indentation   +499 added lines, -499 removed lines patch added patch discarded remove patch
@@ -12,509 +12,509 @@
 block discarded – undo
12 12
  */
13 13
 class Parser {
14 14
 
15
-	/**
16
-	 * @var ClientInterface
17
-	 */
18
-	private $client;
19
-
20
-	/**
21
-	 * @var array
22
-	 */
23
-	private static $cache;
24
-
25
-	/**
26
-	 * Constructor
27
-	 * @param ClientInterface $client HTTP Client
28
-	 */
29
-	public function __construct(ClientInterface $client) {
30
-		$this->client = $client;
31
-	}
32
-
33
-	/**
34
-	 * Parses a URL into an array of metatags
35
-	 *
36
-	 * @param string $url URL to parse
37
-	 * @return array
38
-	 */
39
-	public function parse($url = '') {
40
-
41
-		$data = $this->getImageData($url);
42
-		if (!$data) {
43
-			$data = $this->getOEmbedData($url);
44
-		}
45
-		if (!$data) {
46
-			$data = $this->getDOMData($url);
47
-			if (is_array($data) && !empty($data['oembed_url'])) {
48
-				foreach ($data['oembed_url'] as $oembed_url) {
49
-					$oembed_data = $this->parse($oembed_url);
50
-					if (!empty($oembed_data) && is_array($oembed_data)) {
51
-						$oembed_data['oembed_url'] = $oembed_data['url'];
52
-						unset($oembed_data['url']);
53
-						$data = array_merge($data, $oembed_data);
54
-					}
55
-				}
56
-			}
57
-		}
15
+    /**
16
+     * @var ClientInterface
17
+     */
18
+    private $client;
19
+
20
+    /**
21
+     * @var array
22
+     */
23
+    private static $cache;
24
+
25
+    /**
26
+     * Constructor
27
+     * @param ClientInterface $client HTTP Client
28
+     */
29
+    public function __construct(ClientInterface $client) {
30
+        $this->client = $client;
31
+    }
32
+
33
+    /**
34
+     * Parses a URL into an array of metatags
35
+     *
36
+     * @param string $url URL to parse
37
+     * @return array
38
+     */
39
+    public function parse($url = '') {
40
+
41
+        $data = $this->getImageData($url);
42
+        if (!$data) {
43
+            $data = $this->getOEmbedData($url);
44
+        }
45
+        if (!$data) {
46
+            $data = $this->getDOMData($url);
47
+            if (is_array($data) && !empty($data['oembed_url'])) {
48
+                foreach ($data['oembed_url'] as $oembed_url) {
49
+                    $oembed_data = $this->parse($oembed_url);
50
+                    if (!empty($oembed_data) && is_array($oembed_data)) {
51
+                        $oembed_data['oembed_url'] = $oembed_data['url'];
52
+                        unset($oembed_data['url']);
53
+                        $data = array_merge($data, $oembed_data);
54
+                    }
55
+                }
56
+            }
57
+        }
58 58
 	
59
-		if (!is_array($data)) {
60
-			$data = array();
61
-		}
62
-
63
-		if (empty($data['thumbnail_url']) && !empty($data['thumbnails'])) {
64
-			$data['thumbnail_url'] = $data['thumbnails'][0];
65
-		}
66
-
67
-		return $data;
68
-	}
69
-
70
-	/**
71
-	 * Parses image metatags
72
-	 *
73
-	 * @param string $url URL of the image
74
-	 * @return array|false
75
-	 */
76
-	public function getImageData($url = '') {
77
-		if (!$this->isImage($url)) {
78
-			return false;
79
-		}
80
-
81
-		return array(
82
-			'type' => 'photo',
83
-			'url' => $url,
84
-			'thumbnails' => array($url),
85
-		);
86
-	}
87
-
88
-	/**
89
-	 * Parses OEmbed data
90
-	 *
91
-	 * @param  string $url URL of the OEmbed (JSON or XML) resource
92
-	 * @return array|false
93
-	 */
94
-	public function getOEmbedData($url = '') {
95
-
96
-		if (!$this->isJSON($url) && !$this->isXML($url)) {
97
-			return false;
98
-		}
99
-
100
-		$meta = array(
101
-			'url' => $url,
102
-		);
103
-
104
-		$content = $this->read($url);
105
-		if (!$content) {
106
-			return $meta;
107
-		}
108
-
109
-		$data = new \stdClass();
110
-		if ($this->isJSON($url)) {
111
-			$data = json_decode($content);
112
-		} else if ($this->isXML($url)) {
113
-			$data = simplexml_load_string($content);
114
-		}
115
-
116
-		$props = array(
117
-			'type',
118
-			'version',
119
-			'title',
120
-			'author_name',
121
-			'author_url',
122
-			'provider_name',
123
-			'provider_url',
124
-			'cache_age',
125
-			'thumbnail_url',
126
-			'thumbnail_width',
127
-			'thumbnail_height',
128
-			'width',
129
-			'height',
130
-			'html',
131
-		);
132
-		foreach ($props as $key) {
133
-			if (!empty($data->$key)) {
134
-				$meta[$key] = (string) $data->$key;
135
-			}
136
-		}
137
-		return $meta;
138
-	}
139
-
140
-	/**
141
-	 * Parses metatags from DOM
142
-	 *
143
-	 * @param  string $url URL
144
-	 * @return array|false
145
-	 */
146
-	public function getDOMData($url = '') {
147
-
148
-		if (!$this->isHTML($url)) {
149
-			return false;
150
-		}
151
-
152
-		$doc = $this->getDOM($url);
153
-		if (!$doc) {
154
-			return false;
155
-		}
59
+        if (!is_array($data)) {
60
+            $data = array();
61
+        }
62
+
63
+        if (empty($data['thumbnail_url']) && !empty($data['thumbnails'])) {
64
+            $data['thumbnail_url'] = $data['thumbnails'][0];
65
+        }
66
+
67
+        return $data;
68
+    }
69
+
70
+    /**
71
+     * Parses image metatags
72
+     *
73
+     * @param string $url URL of the image
74
+     * @return array|false
75
+     */
76
+    public function getImageData($url = '') {
77
+        if (!$this->isImage($url)) {
78
+            return false;
79
+        }
80
+
81
+        return array(
82
+            'type' => 'photo',
83
+            'url' => $url,
84
+            'thumbnails' => array($url),
85
+        );
86
+    }
87
+
88
+    /**
89
+     * Parses OEmbed data
90
+     *
91
+     * @param  string $url URL of the OEmbed (JSON or XML) resource
92
+     * @return array|false
93
+     */
94
+    public function getOEmbedData($url = '') {
95
+
96
+        if (!$this->isJSON($url) && !$this->isXML($url)) {
97
+            return false;
98
+        }
99
+
100
+        $meta = array(
101
+            'url' => $url,
102
+        );
103
+
104
+        $content = $this->read($url);
105
+        if (!$content) {
106
+            return $meta;
107
+        }
108
+
109
+        $data = new \stdClass();
110
+        if ($this->isJSON($url)) {
111
+            $data = json_decode($content);
112
+        } else if ($this->isXML($url)) {
113
+            $data = simplexml_load_string($content);
114
+        }
115
+
116
+        $props = array(
117
+            'type',
118
+            'version',
119
+            'title',
120
+            'author_name',
121
+            'author_url',
122
+            'provider_name',
123
+            'provider_url',
124
+            'cache_age',
125
+            'thumbnail_url',
126
+            'thumbnail_width',
127
+            'thumbnail_height',
128
+            'width',
129
+            'height',
130
+            'html',
131
+        );
132
+        foreach ($props as $key) {
133
+            if (!empty($data->$key)) {
134
+                $meta[$key] = (string) $data->$key;
135
+            }
136
+        }
137
+        return $meta;
138
+    }
139
+
140
+    /**
141
+     * Parses metatags from DOM
142
+     *
143
+     * @param  string $url URL
144
+     * @return array|false
145
+     */
146
+    public function getDOMData($url = '') {
147
+
148
+        if (!$this->isHTML($url)) {
149
+            return false;
150
+        }
151
+
152
+        $doc = $this->getDOM($url);
153
+        if (!$doc) {
154
+            return false;
155
+        }
156 156
 		
157
-		$defaults = array(
158
-			'url' => $url,
159
-		);
157
+        $defaults = array(
158
+            'url' => $url,
159
+        );
160 160
 
161
-		$link_tags = $this->parseLinkTags($doc);
162
-		$meta_tags = $this->parseMetaTags($doc);
163
-		$img_tags = $this->parseImgTags($doc);
161
+        $link_tags = $this->parseLinkTags($doc);
162
+        $meta_tags = $this->parseMetaTags($doc);
163
+        $img_tags = $this->parseImgTags($doc);
164 164
 
165
-		$meta = array_merge_recursive($defaults, $link_tags, $meta_tags, $img_tags);
165
+        $meta = array_merge_recursive($defaults, $link_tags, $meta_tags, $img_tags);
166 166
 		
167
-		if (empty($meta['title'])) {
168
-			$meta['title'] = $this->parseTitle($doc);
169
-		}
170
-
171
-
172
-		return $meta;
173
-	}
174
-
175
-	/**
176
-	 * Check if URL exists and is reachable by making an HTTP request to retrieve header information
177
-	 *
178
-	 * @param string $url URL of the resource
179
-	 * @return boolean
180
-	 */
181
-	public function exists($url = '') {
182
-		$response = $this->request($url);
183
-		if ($response instanceof Response) {
184
-			return $response->getStatusCode() == 200;
185
-		}
186
-		return false;
187
-	}
188
-
189
-	/**
190
-	 * Returns the HTTP response for the resource (fetched with a GET request and cached per URL)
191
-	 *
192
-	 * @param string $url URL of the resource
193
-	 * @return Response|false
194
-	 */
195
-	public function request($url = '') {
196
-		if (!filter_var($url, FILTER_VALIDATE_URL)) {
197
-			return false;
198
-		}
199
-		if (!isset(self::$cache[$url])) {
200
-			try {
201
-				$response = $this->client->request('GET', $url);
202
-			} catch (Exception $e) {
203
-				$response = false;
204
-				error_log("Parser Error for HEAD request ($url): {$e->getMessage()}");
205
-			}
206
-			self::$cache[$url] = $response;
207
-		}
208
-
209
-		return self::$cache[$url];
210
-	}
211
-
212
-	/**
213
-	 * Get contents of the page
214
-	 *
215
-	 * @param string $url URL of the resource
216
-	 * @return string
217
-	 */
218
-	public function read($url = '') {
219
-		$body = '';
220
-		if (!$this->exists($url)) {
221
-			return $body;
222
-		}
223
-
224
-		$response = $this->request($url);
225
-		$body = (string) $response->getBody();
226
-		return $body;
227
-	}
228
-
229
-	/**
230
-	 * Checks if resource is an html page
231
-	 *
232
-	 * @param string $url URL of the resource
233
-	 * @return boolean
234
-	 */
235
-	public function isHTML($url = '') {
236
-		$mime = $this->getContentType($url);
237
-		return strpos($mime, 'text/html') !== false;
238
-	}
239
-
240
-	/**
241
-	 * Checks if resource is JSON
242
-	 *
243
-	 * @param string $url URL of the resource
244
-	 * @return boolean
245
-	 */
246
-	public function isJSON($url = '') {
247
-		$mime = $this->getContentType($url);
248
-		return strpos($mime, 'json') !== false;
249
-	}
250
-
251
-	/**
252
-	 * Checks if resource is XML
253
-	 *
254
-	 * @param string $url URL of the resource
255
-	 * @return boolean
256
-	 */
257
-	public function isXML($url = '') {
258
-		$mime = $this->getContentType($url);
259
-		return strpos($mime, 'xml') !== false;
260
-	}
261
-
262
-	/**
263
-	 * Checks if resource is an image
264
-	 *
265
-	 * @param string $url URL of the resource
266
-	 * @return boolean
267
-	 */
268
-	public function isImage($url = '') {
269
-		$mime = $this->getContentType($url);
270
-		if ($mime) {
271
-			list($simple,) = explode('/', $mime);
272
-			return ($simple == 'image');
273
-		}
274
-
275
-		return false;
276
-	}
277
-
278
-	/**
279
-	 * Get mime type of the URL content
280
-	 *
281
-	 * @param string $url URL of the resource
282
-	 * @return string
283
-	 */
284
-	public function getContentType($url = '') {
285
-		$response = $this->request($url);
286
-		if ($response instanceof Response) {
287
-			$header = $response->getHeader('Content-Type');
288
-			if (is_array($header) && !empty($header)) {
289
-				$parts = explode(';', $header[0]);
290
-				return trim($parts[0]);
291
-			}
292
-		}
293
-		return '';
294
-	}
295
-
296
-	/**
297
-	 * Returns HTML contents of the page
298
-	 *
299
-	 * @param string $url URL of the resource
300
-	 * @return string
301
-	 */
302
-	public function getHTML($url = '') {
303
-		if (!$this->isHTML($url)) {
304
-			return '';
305
-		}
306
-		return $this->read($url);
307
-	}
308
-
309
-	/**
310
-	 * Returns HTML contents of the page as a DOMDocument
311
-	 *
312
-	 * @param string $url URL of the resource
313
-	 * @return DOMDocument|false
314
-	 */
315
-	public function getDOM($url = '') {
316
-		$html = $this->getHTML($url);
317
-		if (empty($html)) {
318
-			return false;
319
-		}
320
-		$doc = new DOMDocument();
321
-		if (is_callable('mb_convert_encoding')) {
322
-			$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
323
-		} else {
324
-			$doc->loadHTML($html);
325
-		}
326
-		if (!$doc->documentURI) {
327
-			$doc->documentURI = $url;
328
-		}
329
-		return $doc;
330
-	}
331
-
332
-	/**
333
-	 * Parses document title
334
-	 *
335
-	 * @param DOMDocument $doc Document
336
-	 * @return string
337
-	 */
338
-	public function parseTitle(DOMDocument $doc) {
339
-		$node = $doc->getElementsByTagName('title');
340
-		$title = $node->item(0)->nodeValue;
341
-		return ($title) ?: '';
342
-	}
343
-
344
-	/**
345
-	 * Parses <link> tags
346
-	 *
347
-	 * @param DOMDocument $doc Document
348
-	 * @return array
349
-	 */
350
-	public function parseLinkTags(DOMDocument $doc) {
351
-
352
-		$meta = array();
353
-
354
-		$nodes = $doc->getElementsByTagName('link');
355
-		foreach ($nodes as $node) {
356
-			$rel = $node->getAttribute('rel');
357
-			$href = $node->getAttribute('href');
358
-
359
-			switch ($rel) {
360
-
361
-				case 'icon' :
362
-					$meta['icons'][] = $this->getAbsoluteURL($doc, $href);
363
-					break;
364
-
365
-				case 'canonical' :
366
-					$meta['canonical'] = $this->getAbsoluteURL($doc, $href);
367
-					break;
368
-
369
-				case 'alternate' :
370
-					$type = $node->getAttribute('type');
371
-					if (in_array($type, array(
372
-								'application/json+oembed',
373
-								'text/json+oembed',
374
-								'application/xml+oembed',
375
-								'text/xml+oembed'
376
-							))) {
377
-						$meta['oembed_url'][] = $this->getAbsoluteURL($doc, $href);
378
-					}
379
-					break;
380
-			}
381
-		}
382
-
383
-		return $meta;
384
-	}
385
-
386
-	/**
387
-	 * Parses <meta> tags
388
-	 *
389
-	 * @param DOMDocument $doc Document
390
-	 * @return array
391
-	 */
392
-	public function parseMetaTags(DOMDocument $doc) {
393
-
394
-		$meta = array();
395
-
396
-		$nodes = $doc->getElementsByTagName('meta');
397
-		if (!empty($nodes)) {
398
-			foreach ($nodes as $node) {
399
-				$name = $node->getAttribute('name');
400
-				if (!$name) {
401
-					$name = $node->getAttribute('property');
402
-				}
403
-				if (!$name) {
404
-					continue;
405
-				}
406
-
407
-				$name = strtolower($name);
408
-
409
-				$content = $node->getAttribute('content');
410
-				if (isset($meta['metatags'][$name])) {
411
-					if (!is_array($meta['metatags'][$name])) {
412
-						$meta['metatags'][$name] = array($meta['metatags'][$name]);
413
-					}
414
-					$meta['metatags'][$name][] = $content;
415
-				} else {
416
-					$meta['metatags'][$name] = $content;
417
-				}
418
-
419
-				switch ($name) {
420
-
421
-					case 'title' :
422
-					case 'og:title' :
423
-					case 'twitter:title' :
424
-						if (empty($meta['title'])) {
425
-							$meta['title'] = $content;
426
-						}
427
-						break;
428
-
429
-					case 'og:type' :
430
-						if (empty($meta['type'])) {
431
-							$meta['type'] = $content;
432
-						}
433
-						break;
434
-
435
-					case 'description' :
436
-					case 'og:description' :
437
-					case 'twitter:description' :
438
-						if (empty($meta['description'])) {
439
-							$meta['description'] = $content;
440
-						}
441
-						break;
442
-
443
-					case 'keywords' :
444
-						if (is_string($content)) {
445
-							$content = explode(',', $content);
446
-							$content = array_map('trim', $content);
447
-						}
448
-						$meta['tags'] = $content;
449
-						break;
450
-
451
-					case 'og:site_name' :
452
-					case 'twitter:site' :
453
-						if (empty($meta['provider_name'])) {
454
-							$meta['provider_name'] = $content;
455
-						}
456
-						break;
457
-
458
-					case 'og:image' :
459
-					case 'twitter:image' :
460
-						$meta['thumbnails'][] = $this->getAbsoluteURL($doc, $content);
461
-						break;
462
-				}
463
-			}
464
-		}
465
-
466
-		return $meta;
467
-	}
468
-
469
-	/**
470
-	 * Parses <img> tags
471
-	 *
472
-	 * @param DOMDocument $doc Document
473
-	 * @return array
474
-	 */
475
-	public function parseImgTags(DOMDocument $doc) {
476
-
477
-		$meta = array();
478
-
479
-		$nodes = $doc->getElementsByTagName('img');
480
-		foreach ($nodes as $node) {
481
-			$src = $node->getAttribute('src');
482
-			$meta['thumbnails'][] = $this->getAbsoluteURL($doc, $src);
483
-		}
484
-
485
-		return $meta;
486
-	}
487
-
488
-	/**
489
-	 * Normalizes relative URLs
490
-	 *
491
-	 * @param DOMDocument $doc  Document
492
-	 * @param string      $href URL to normalize
493
-	 * @return string|false
494
-	 */
495
-	public function getAbsoluteURL(DOMDocument $doc, $href = '') {
496
-
497
-		if (preg_match("/^data:/i", $href)) {
498
-			// data URIs can not be resolved
499
-			return false;
500
-		}
501
-
502
-		// Check if $href is absolute
503
-		if (parse_url($href, PHP_URL_HOST)) {
504
-			return $href;
505
-		}
506
-
507
-		$uri = trim($doc->documentURI ?: '', '/');
508
-
509
-		// Check if $href is relative to root
510
-		if (substr($href, 0, 1) === "/") {
511
-			$scheme = parse_url($uri, PHP_URL_SCHEME);
512
-			$host = parse_url($uri, PHP_URL_HOST);
513
-			return "$scheme://$host$href";
514
-		}
515
-
516
-		// $href is relative to page
517
-		return "$uri/$href";
518
-	}
167
+        if (empty($meta['title'])) {
168
+            $meta['title'] = $this->parseTitle($doc);
169
+        }
170
+
171
+
172
+        return $meta;
173
+    }
174
+
175
+    /**
176
+     * Check if URL exists and is reachable by making an HTTP request to retrieve header information
177
+     *
178
+     * @param string $url URL of the resource
179
+     * @return boolean
180
+     */
181
+    public function exists($url = '') {
182
+        $response = $this->request($url);
183
+        if ($response instanceof Response) {
184
+            return $response->getStatusCode() == 200;
185
+        }
186
+        return false;
187
+    }
188
+
189
+    /**
190
+     * Returns the HTTP response for the resource (fetched with a GET request and cached per URL)
191
+     *
192
+     * @param string $url URL of the resource
193
+     * @return Response|false
194
+     */
195
+    public function request($url = '') {
196
+        if (!filter_var($url, FILTER_VALIDATE_URL)) {
197
+            return false;
198
+        }
199
+        if (!isset(self::$cache[$url])) {
200
+            try {
201
+                $response = $this->client->request('GET', $url);
202
+            } catch (Exception $e) {
203
+                $response = false;
204
+                error_log("Parser Error for HEAD request ($url): {$e->getMessage()}");
205
+            }
206
+            self::$cache[$url] = $response;
207
+        }
208
+
209
+        return self::$cache[$url];
210
+    }
211
+
212
+    /**
213
+     * Get contents of the page
214
+     *
215
+     * @param string $url URL of the resource
216
+     * @return string
217
+     */
218
+    public function read($url = '') {
219
+        $body = '';
220
+        if (!$this->exists($url)) {
221
+            return $body;
222
+        }
223
+
224
+        $response = $this->request($url);
225
+        $body = (string) $response->getBody();
226
+        return $body;
227
+    }
228
+
229
+    /**
230
+     * Checks if resource is an html page
231
+     *
232
+     * @param string $url URL of the resource
233
+     * @return boolean
234
+     */
235
+    public function isHTML($url = '') {
236
+        $mime = $this->getContentType($url);
237
+        return strpos($mime, 'text/html') !== false;
238
+    }
239
+
240
+    /**
241
+     * Checks if resource is JSON
242
+     *
243
+     * @param string $url URL of the resource
244
+     * @return boolean
245
+     */
246
+    public function isJSON($url = '') {
247
+        $mime = $this->getContentType($url);
248
+        return strpos($mime, 'json') !== false;
249
+    }
250
+
251
+    /**
252
+     * Checks if resource is XML
253
+     *
254
+     * @param string $url URL of the resource
255
+     * @return boolean
256
+     */
257
+    public function isXML($url = '') {
258
+        $mime = $this->getContentType($url);
259
+        return strpos($mime, 'xml') !== false;
260
+    }
261
+
262
+    /**
263
+     * Checks if resource is an image
264
+     *
265
+     * @param string $url URL of the resource
266
+     * @return boolean
267
+     */
268
+    public function isImage($url = '') {
269
+        $mime = $this->getContentType($url);
270
+        if ($mime) {
271
+            list($simple,) = explode('/', $mime);
272
+            return ($simple == 'image');
273
+        }
274
+
275
+        return false;
276
+    }
277
+
278
+    /**
279
+     * Get mime type of the URL content
280
+     *
281
+     * @param string $url URL of the resource
282
+     * @return string
283
+     */
284
+    public function getContentType($url = '') {
285
+        $response = $this->request($url);
286
+        if ($response instanceof Response) {
287
+            $header = $response->getHeader('Content-Type');
288
+            if (is_array($header) && !empty($header)) {
289
+                $parts = explode(';', $header[0]);
290
+                return trim($parts[0]);
291
+            }
292
+        }
293
+        return '';
294
+    }
295
+
296
+    /**
297
+     * Returns HTML contents of the page
298
+     *
299
+     * @param string $url URL of the resource
300
+     * @return string
301
+     */
302
+    public function getHTML($url = '') {
303
+        if (!$this->isHTML($url)) {
304
+            return '';
305
+        }
306
+        return $this->read($url);
307
+    }
308
+
309
+    /**
310
+     * Returns HTML contents of the page as a DOMDocument
311
+     *
312
+     * @param string $url URL of the resource
313
+     * @return DOMDocument|false
314
+     */
315
+    public function getDOM($url = '') {
316
+        $html = $this->getHTML($url);
317
+        if (empty($html)) {
318
+            return false;
319
+        }
320
+        $doc = new DOMDocument();
321
+        if (is_callable('mb_convert_encoding')) {
322
+            $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
323
+        } else {
324
+            $doc->loadHTML($html);
325
+        }
326
+        if (!$doc->documentURI) {
327
+            $doc->documentURI = $url;
328
+        }
329
+        return $doc;
330
+    }
331
+
332
+    /**
333
+     * Parses document title
334
+     *
335
+     * @param DOMDocument $doc Document
336
+     * @return string
337
+     */
338
+    public function parseTitle(DOMDocument $doc) {
339
+        $node = $doc->getElementsByTagName('title');
340
+        $title = $node->item(0)->nodeValue;
341
+        return ($title) ?: '';
342
+    }
343
+
344
+    /**
345
+     * Parses <link> tags
346
+     *
347
+     * @param DOMDocument $doc Document
348
+     * @return array
349
+     */
350
+    public function parseLinkTags(DOMDocument $doc) {
351
+
352
+        $meta = array();
353
+
354
+        $nodes = $doc->getElementsByTagName('link');
355
+        foreach ($nodes as $node) {
356
+            $rel = $node->getAttribute('rel');
357
+            $href = $node->getAttribute('href');
358
+
359
+            switch ($rel) {
360
+
361
+                case 'icon' :
362
+                    $meta['icons'][] = $this->getAbsoluteURL($doc, $href);
363
+                    break;
364
+
365
+                case 'canonical' :
366
+                    $meta['canonical'] = $this->getAbsoluteURL($doc, $href);
367
+                    break;
368
+
369
+                case 'alternate' :
370
+                    $type = $node->getAttribute('type');
371
+                    if (in_array($type, array(
372
+                                'application/json+oembed',
373
+                                'text/json+oembed',
374
+                                'application/xml+oembed',
375
+                                'text/xml+oembed'
376
+                            ))) {
377
+                        $meta['oembed_url'][] = $this->getAbsoluteURL($doc, $href);
378
+                    }
379
+                    break;
380
+            }
381
+        }
382
+
383
+        return $meta;
384
+    }
385
+
386
+    /**
387
+     * Parses <meta> tags
388
+     *
389
+     * @param DOMDocument $doc Document
390
+     * @return array
391
+     */
392
+    public function parseMetaTags(DOMDocument $doc) {
393
+
394
+        $meta = array();
395
+
396
+        $nodes = $doc->getElementsByTagName('meta');
397
+        if (!empty($nodes)) {
398
+            foreach ($nodes as $node) {
399
+                $name = $node->getAttribute('name');
400
+                if (!$name) {
401
+                    $name = $node->getAttribute('property');
402
+                }
403
+                if (!$name) {
404
+                    continue;
405
+                }
406
+
407
+                $name = strtolower($name);
408
+
409
+                $content = $node->getAttribute('content');
410
+                if (isset($meta['metatags'][$name])) {
411
+                    if (!is_array($meta['metatags'][$name])) {
412
+                        $meta['metatags'][$name] = array($meta['metatags'][$name]);
413
+                    }
414
+                    $meta['metatags'][$name][] = $content;
415
+                } else {
416
+                    $meta['metatags'][$name] = $content;
417
+                }
418
+
419
+                switch ($name) {
420
+
421
+                    case 'title' :
422
+                    case 'og:title' :
423
+                    case 'twitter:title' :
424
+                        if (empty($meta['title'])) {
425
+                            $meta['title'] = $content;
426
+                        }
427
+                        break;
428
+
429
+                    case 'og:type' :
430
+                        if (empty($meta['type'])) {
431
+                            $meta['type'] = $content;
432
+                        }
433
+                        break;
434
+
435
+                    case 'description' :
436
+                    case 'og:description' :
437
+                    case 'twitter:description' :
438
+                        if (empty($meta['description'])) {
439
+                            $meta['description'] = $content;
440
+                        }
441
+                        break;
442
+
443
+                    case 'keywords' :
444
+                        if (is_string($content)) {
445
+                            $content = explode(',', $content);
446
+                            $content = array_map('trim', $content);
447
+                        }
448
+                        $meta['tags'] = $content;
449
+                        break;
450
+
451
+                    case 'og:site_name' :
452
+                    case 'twitter:site' :
453
+                        if (empty($meta['provider_name'])) {
454
+                            $meta['provider_name'] = $content;
455
+                        }
456
+                        break;
457
+
458
+                    case 'og:image' :
459
+                    case 'twitter:image' :
460
+                        $meta['thumbnails'][] = $this->getAbsoluteURL($doc, $content);
461
+                        break;
462
+                }
463
+            }
464
+        }
465
+
466
+        return $meta;
467
+    }
468
+
469
+    /**
470
+     * Parses <img> tags
471
+     *
472
+     * @param DOMDocument $doc Document
473
+     * @return array
474
+     */
475
+    public function parseImgTags(DOMDocument $doc) {
476
+
477
+        $meta = array();
478
+
479
+        $nodes = $doc->getElementsByTagName('img');
480
+        foreach ($nodes as $node) {
481
+            $src = $node->getAttribute('src');
482
+            $meta['thumbnails'][] = $this->getAbsoluteURL($doc, $src);
483
+        }
484
+
485
+        return $meta;
486
+    }
487
+
488
+    /**
489
+     * Normalizes relative URLs
490
+     *
491
+     * @param DOMDocument $doc  Document
492
+     * @param string      $href URL to normalize
493
+     * @return string|false
494
+     */
495
+    public function getAbsoluteURL(DOMDocument $doc, $href = '') {
496
+
497
+        if (preg_match("/^data:/i", $href)) {
498
+            // data URIs can not be resolved
499
+            return false;
500
+        }
501
+
502
+        // Check if $href is absolute
503
+        if (parse_url($href, PHP_URL_HOST)) {
504
+            return $href;
505
+        }
506
+
507
+        $uri = trim($doc->documentURI ?: '', '/');
508
+
509
+        // Check if $href is relative to root
510
+        if (substr($href, 0, 1) === "/") {
511
+            $scheme = parse_url($uri, PHP_URL_SCHEME);
512
+            $host = parse_url($uri, PHP_URL_HOST);
513
+            return "$scheme://$host$href";
514
+        }
515
+
516
+        // $href is relative to page
517
+        return "$uri/$href";
518
+    }
519 519
 
520 520
 }
Please login to merge, or discard this patch.