@@ -12,565 +12,565 @@ |
||
12 | 12 | */ |
13 | 13 | class Parser { |
14 | 14 | |
15 | - /** |
|
16 | - * @var ClientInterface |
|
17 | - */ |
|
18 | - private $client; |
|
19 | - |
|
20 | - /** |
|
21 | - * @var array |
|
22 | - */ |
|
23 | - private static $cache; |
|
24 | - |
|
25 | - /** |
|
26 | - * Constructor |
|
27 | - * @param ClientInterface $client HTTP Client |
|
28 | - */ |
|
29 | - public function __construct(ClientInterface $client) { |
|
30 | - $this->client = $client; |
|
31 | - } |
|
32 | - |
|
33 | - /** |
|
34 | - * Parses a URL into a an array of metatags |
|
35 | - * |
|
36 | - * @param string $url URL to parse |
|
37 | - * @return array |
|
38 | - */ |
|
39 | - public function parse($url = '') { |
|
40 | - |
|
41 | - $data = $this->getImageData($url); |
|
42 | - if (!$data) { |
|
43 | - $data = $this->getOEmbedData($url); |
|
44 | - } |
|
45 | - if (!$data) { |
|
46 | - $data = $this->getDOMData($url); |
|
47 | - if (is_array($data) && !empty($data['oembed_url'])) { |
|
48 | - foreach ($data['oembed_url'] as $oembed_url) { |
|
49 | - $oembed_data = $this->parse($oembed_url); |
|
50 | - if (!empty($oembed_data) && is_array($oembed_data)) { |
|
51 | - $oembed_data['oembed_url'] = $oembed_data['url']; |
|
52 | - unset($oembed_data['url']); |
|
53 | - $data = array_merge($data, $oembed_data); |
|
54 | - } |
|
55 | - } |
|
56 | - } |
|
57 | - } |
|
58 | - |
|
59 | - if (!is_array($data)) { |
|
60 | - $data = array(); |
|
61 | - } |
|
62 | - |
|
63 | - if (empty($data['thumbnail_url']) && !empty($data['thumbnails'])) { |
|
64 | - $data['thumbnail_url'] = $data['thumbnails'][0]; |
|
65 | - } |
|
66 | - |
|
67 | - return $data; |
|
68 | - } |
|
69 | - |
|
70 | - /** |
|
71 | - * Parses image metatags |
|
72 | - * |
|
73 | - * @param string $url URL of the image |
|
74 | - * @return array|false |
|
75 | - */ |
|
76 | - public function getImageData($url = '') { |
|
77 | - if (!$this->isImage($url)) { |
|
78 | - return false; |
|
79 | - } |
|
80 | - |
|
81 | - return array( |
|
82 | - 'type' => 'photo', |
|
83 | - 'url' => $url, |
|
84 | - 'thumbnails' => array($url), |
|
85 | - ); |
|
86 | - } |
|
87 | - |
|
88 | - /** |
|
89 | - * Parses OEmbed data |
|
90 | - * |
|
91 | - * @param string $url URL of the image |
|
92 | - * @return array|false |
|
93 | - */ |
|
94 | - public function getOEmbedData($url = '') { |
|
95 | - |
|
96 | - if (!$this->isJSON($url) && !$this->isXML($url)) { |
|
97 | - return false; |
|
98 | - } |
|
99 | - |
|
100 | - $meta = array( |
|
101 | - 'url' => $url, |
|
102 | - ); |
|
103 | - |
|
104 | - $content = $this->read($url); |
|
105 | - if (!$content) { |
|
106 | - return $meta; |
|
107 | - } |
|
108 | - |
|
109 | - $data = new \stdClass(); |
|
110 | - if ($this->isJSON($url)) { |
|
111 | - $data = json_decode($content); |
|
112 | - } else if ($this->isXML($url)) { |
|
113 | - $data = simplexml_load_string($content); |
|
114 | - } |
|
115 | - |
|
116 | - $props = array( |
|
117 | - 'type', |
|
118 | - 'version', |
|
119 | - 'title', |
|
120 | - 'author_name', |
|
121 | - 'author_url', |
|
122 | - 'provider_name', |
|
123 | - 'provider_url', |
|
124 | - 'cache_age', |
|
125 | - 'thumbnail_url', |
|
126 | - 'thumbnail_width', |
|
127 | - 'thumbnail_height', |
|
128 | - 'width', |
|
129 | - 'height', |
|
130 | - 'html', |
|
131 | - ); |
|
132 | - foreach ($props as $key) { |
|
133 | - if (!empty($data->$key)) { |
|
134 | - $meta[$key] = (string) $data->$key; |
|
135 | - } |
|
136 | - } |
|
137 | - return $meta; |
|
138 | - } |
|
139 | - |
|
140 | - /** |
|
141 | - * Parses metatags from DOM |
|
142 | - * |
|
143 | - * @param string $url URL |
|
144 | - * @return array|false |
|
145 | - */ |
|
146 | - public function getDOMData($url = '') { |
|
147 | - |
|
148 | - if (!$this->isHTML($url)) { |
|
149 | - return false; |
|
150 | - } |
|
151 | - |
|
152 | - $doc = $this->getDOM($url); |
|
153 | - if (!$doc) { |
|
154 | - return false; |
|
155 | - } |
|
156 | - |
|
157 | - $defaults = array( |
|
158 | - 'url' => $url, |
|
159 | - ); |
|
160 | - |
|
161 | - $link_tags = $this->parseLinkTags($doc); |
|
162 | - $meta_tags = $this->parseMetaTags($doc); |
|
163 | - $img_tags = $this->parseImgTags($doc); |
|
164 | - |
|
165 | - $meta = array_merge_recursive($defaults, $link_tags, $meta_tags, $img_tags); |
|
166 | - |
|
167 | - if (empty($meta['title'])) { |
|
168 | - $meta['title'] = $this->parseTitle($doc); |
|
169 | - } |
|
170 | - |
|
171 | - |
|
172 | - return $meta; |
|
173 | - } |
|
174 | - |
|
175 | - /** |
|
176 | - * Check if URL exists and is reachable by making an HTTP request to retrieve header information |
|
177 | - * |
|
178 | - * @param string $url URL of the resource |
|
179 | - * @return boolean |
|
180 | - */ |
|
181 | - public function exists($url = '') { |
|
182 | - $response = $this->request($url); |
|
183 | - if ($response instanceof Response) { |
|
184 | - return $response->getStatusCode() == 200; |
|
185 | - } |
|
186 | - return false; |
|
187 | - } |
|
188 | - |
|
189 | - /** |
|
190 | - * Validate URL |
|
191 | - * |
|
192 | - * @param string $url URL to validate |
|
193 | - * @return bool |
|
194 | - */ |
|
195 | - public function isValidUrl($url = '') { |
|
196 | - // based on http://php.net/manual/en/function.filter-var.php#104160 |
|
197 | - // adapted by @mrclay in https://github.com/mrclay/Elgg-leaf/blob/62bf31c0ccdaab549a7e585a4412443e09821db3/engine/lib/output.php |
|
198 | - $res = filter_var($url, FILTER_VALIDATE_URL); |
|
199 | - if ($res) { |
|
200 | - return $res; |
|
201 | - } |
|
202 | - // Check if it has unicode chars. |
|
203 | - $l = mb_strlen($url); |
|
204 | - if (strlen($url) == $l) { |
|
205 | - return $res; |
|
206 | - } |
|
207 | - // Replace wide chars by “X”. |
|
208 | - $s = ''; |
|
209 | - for ($i = 0; $i < $l; ++$i) { |
|
210 | - $ch = elgg_substr($url, $i, 1); |
|
211 | - $s .= (strlen($ch) > 1) ? 'X' : $ch; |
|
212 | - } |
|
213 | - // Re-check now. |
|
214 | - return filter_var($s, FILTER_VALIDATE_URL) ? $url : false; |
|
215 | - } |
|
216 | - |
|
217 | - /** |
|
218 | - * Returns head of the resource |
|
219 | - * |
|
220 | - * @param string $url URL of the resource |
|
221 | - * @return Response|false |
|
222 | - */ |
|
223 | - public function request($url = '') { |
|
224 | - $url = str_replace(' ', '%20', $url); |
|
225 | - if (!$this->isValidUrl($url)) { |
|
226 | - return false; |
|
227 | - } |
|
228 | - if (!isset(self::$cache[$url])) { |
|
229 | - try { |
|
230 | - $response = $this->client->request('GET', $url); |
|
231 | - } catch (Exception $e) { |
|
232 | - $response = false; |
|
233 | - error_log("Parser Error for HEAD request ($url): {$e->getMessage()}"); |
|
234 | - } |
|
235 | - self::$cache[$url] = $response; |
|
236 | - } |
|
237 | - |
|
238 | - return self::$cache[$url]; |
|
239 | - } |
|
240 | - |
|
241 | - /** |
|
242 | - * Get contents of the page |
|
243 | - * |
|
244 | - * @param string $url URL of the resource |
|
245 | - * @return string |
|
246 | - */ |
|
247 | - public function read($url = '') { |
|
248 | - $body = ''; |
|
249 | - if (!$this->exists($url)) { |
|
250 | - return $body; |
|
251 | - } |
|
252 | - |
|
253 | - $response = $this->request($url); |
|
254 | - $body = (string) $response->getBody(); |
|
255 | - return $body; |
|
256 | - } |
|
257 | - |
|
258 | - /** |
|
259 | - * Checks if resource is an html page |
|
260 | - * |
|
261 | - * @param string $url URL of the resource |
|
262 | - * @return boolean |
|
263 | - */ |
|
264 | - public function isHTML($url = '') { |
|
265 | - $mime = $this->getContentType($url); |
|
266 | - return strpos($mime, 'text/html') !== false; |
|
267 | - } |
|
268 | - |
|
269 | - /** |
|
270 | - * Checks if resource is JSON |
|
271 | - * |
|
272 | - * @param string $url URL of the resource |
|
273 | - * @return boolean |
|
274 | - */ |
|
275 | - public function isJSON($url = '') { |
|
276 | - $mime = $this->getContentType($url); |
|
277 | - return strpos($mime, 'json') !== false; |
|
278 | - } |
|
279 | - |
|
280 | - /** |
|
281 | - * Checks if resource is XML |
|
282 | - * |
|
283 | - * @param string $url URL of the resource |
|
284 | - * @return boolean |
|
285 | - */ |
|
286 | - public function isXML($url = '') { |
|
287 | - $mime = $this->getContentType($url); |
|
288 | - return strpos($mime, 'xml') !== false; |
|
289 | - } |
|
290 | - |
|
291 | - /** |
|
292 | - * Checks if resource is an image |
|
293 | - * |
|
294 | - * @param string $url URL of the resource |
|
295 | - * @return boolean |
|
296 | - */ |
|
297 | - public function isImage($url = '') { |
|
298 | - $mime = $this->getContentType($url); |
|
299 | - if ($mime) { |
|
300 | - list($simple, ) = explode('/', $mime); |
|
301 | - return ($simple == 'image'); |
|
302 | - } |
|
303 | - |
|
304 | - return false; |
|
305 | - } |
|
306 | - |
|
307 | - /** |
|
308 | - * Get mime type of the URL content |
|
309 | - * |
|
310 | - * @param string $url URL of the resource |
|
311 | - * @return string |
|
312 | - */ |
|
313 | - public function getContentType($url = '') { |
|
314 | - $response = $this->request($url); |
|
315 | - if ($response instanceof Response) { |
|
316 | - $header = $response->getHeader('Content-Type'); |
|
317 | - if (is_array($header) && !empty($header)) { |
|
318 | - $parts = explode(';', $header[0]); |
|
319 | - return trim($parts[0]); |
|
320 | - } |
|
321 | - } |
|
322 | - return ''; |
|
323 | - } |
|
324 | - |
|
325 | - /** |
|
326 | - * Returns HTML contents of the page |
|
327 | - * |
|
328 | - * @param string $url URL of the resource |
|
329 | - * @return string |
|
330 | - */ |
|
331 | - public function getHTML($url = '') { |
|
332 | - if (!$this->isHTML($url)) { |
|
333 | - return ''; |
|
334 | - } |
|
335 | - return $this->read($url); |
|
336 | - } |
|
337 | - |
|
338 | - /** |
|
339 | - * Returns HTML contents of the page as a DOMDocument |
|
340 | - * |
|
341 | - * @param string $url URL of the resource |
|
342 | - * @return DOMDocument|false |
|
343 | - */ |
|
344 | - public function getDOM($url = '') { |
|
345 | - $html = $this->getHTML($url); |
|
346 | - if (empty($html)) { |
|
347 | - return false; |
|
348 | - } |
|
349 | - $doc = new DOMDocument(); |
|
15 | + /** |
|
16 | + * @var ClientInterface |
|
17 | + */ |
|
18 | + private $client; |
|
19 | + |
|
20 | + /** |
|
21 | + * @var array |
|
22 | + */ |
|
23 | + private static $cache; |
|
24 | + |
|
25 | + /** |
|
26 | + * Constructor |
|
27 | + * @param ClientInterface $client HTTP Client |
|
28 | + */ |
|
29 | + public function __construct(ClientInterface $client) { |
|
30 | + $this->client = $client; |
|
31 | + } |
|
32 | + |
|
33 | + /** |
|
34 | + * Parses a URL into an array of metatags |
|
35 | + * |
|
36 | + * @param string $url URL to parse |
|
37 | + * @return array |
|
38 | + */ |
|
39 | + public function parse($url = '') { |
|
40 | + |
|
41 | + $data = $this->getImageData($url); |
|
42 | + if (!$data) { |
|
43 | + $data = $this->getOEmbedData($url); |
|
44 | + } |
|
45 | + if (!$data) { |
|
46 | + $data = $this->getDOMData($url); |
|
47 | + if (is_array($data) && !empty($data['oembed_url'])) { |
|
48 | + foreach ($data['oembed_url'] as $oembed_url) { |
|
49 | + $oembed_data = $this->parse($oembed_url); |
|
50 | + if (!empty($oembed_data) && is_array($oembed_data)) { |
|
51 | + $oembed_data['oembed_url'] = $oembed_data['url']; |
|
52 | + unset($oembed_data['url']); |
|
53 | + $data = array_merge($data, $oembed_data); |
|
54 | + } |
|
55 | + } |
|
56 | + } |
|
57 | + } |
|
58 | + |
|
59 | + if (!is_array($data)) { |
|
60 | + $data = array(); |
|
61 | + } |
|
62 | + |
|
63 | + if (empty($data['thumbnail_url']) && !empty($data['thumbnails'])) { |
|
64 | + $data['thumbnail_url'] = $data['thumbnails'][0]; |
|
65 | + } |
|
66 | + |
|
67 | + return $data; |
|
68 | + } |
|
69 | + |
|
70 | + /** |
|
71 | + * Parses image metatags |
|
72 | + * |
|
73 | + * @param string $url URL of the image |
|
74 | + * @return array|false |
|
75 | + */ |
|
76 | + public function getImageData($url = '') { |
|
77 | + if (!$this->isImage($url)) { |
|
78 | + return false; |
|
79 | + } |
|
80 | + |
|
81 | + return array( |
|
82 | + 'type' => 'photo', |
|
83 | + 'url' => $url, |
|
84 | + 'thumbnails' => array($url), |
|
85 | + ); |
|
86 | + } |
|
87 | + |
|
88 | + /** |
|
89 | + * Parses OEmbed data |
|
90 | + * |
|
91 | + * @param string $url URL of the OEmbed resource (JSON or XML) |
|
92 | + * @return array|false |
|
93 | + */ |
|
94 | + public function getOEmbedData($url = '') { |
|
95 | + |
|
96 | + if (!$this->isJSON($url) && !$this->isXML($url)) { |
|
97 | + return false; |
|
98 | + } |
|
99 | + |
|
100 | + $meta = array( |
|
101 | + 'url' => $url, |
|
102 | + ); |
|
103 | + |
|
104 | + $content = $this->read($url); |
|
105 | + if (!$content) { |
|
106 | + return $meta; |
|
107 | + } |
|
108 | + |
|
109 | + $data = new \stdClass(); |
|
110 | + if ($this->isJSON($url)) { |
|
111 | + $data = json_decode($content); |
|
112 | + } else if ($this->isXML($url)) { |
|
113 | + $data = simplexml_load_string($content); |
|
114 | + } |
|
115 | + |
|
116 | + $props = array( |
|
117 | + 'type', |
|
118 | + 'version', |
|
119 | + 'title', |
|
120 | + 'author_name', |
|
121 | + 'author_url', |
|
122 | + 'provider_name', |
|
123 | + 'provider_url', |
|
124 | + 'cache_age', |
|
125 | + 'thumbnail_url', |
|
126 | + 'thumbnail_width', |
|
127 | + 'thumbnail_height', |
|
128 | + 'width', |
|
129 | + 'height', |
|
130 | + 'html', |
|
131 | + ); |
|
132 | + foreach ($props as $key) { |
|
133 | + if (!empty($data->$key)) { |
|
134 | + $meta[$key] = (string) $data->$key; |
|
135 | + } |
|
136 | + } |
|
137 | + return $meta; |
|
138 | + } |
|
139 | + |
|
140 | + /** |
|
141 | + * Parses metatags from DOM |
|
142 | + * |
|
143 | + * @param string $url URL |
|
144 | + * @return array|false |
|
145 | + */ |
|
146 | + public function getDOMData($url = '') { |
|
147 | + |
|
148 | + if (!$this->isHTML($url)) { |
|
149 | + return false; |
|
150 | + } |
|
151 | + |
|
152 | + $doc = $this->getDOM($url); |
|
153 | + if (!$doc) { |
|
154 | + return false; |
|
155 | + } |
|
156 | + |
|
157 | + $defaults = array( |
|
158 | + 'url' => $url, |
|
159 | + ); |
|
160 | + |
|
161 | + $link_tags = $this->parseLinkTags($doc); |
|
162 | + $meta_tags = $this->parseMetaTags($doc); |
|
163 | + $img_tags = $this->parseImgTags($doc); |
|
164 | + |
|
165 | + $meta = array_merge_recursive($defaults, $link_tags, $meta_tags, $img_tags); |
|
166 | + |
|
167 | + if (empty($meta['title'])) { |
|
168 | + $meta['title'] = $this->parseTitle($doc); |
|
169 | + } |
|
170 | + |
|
171 | + |
|
172 | + return $meta; |
|
173 | + } |
|
174 | + |
|
175 | + /** |
|
176 | + * Check if URL exists and is reachable by making an HTTP request to retrieve header information |
|
177 | + * |
|
178 | + * @param string $url URL of the resource |
|
179 | + * @return boolean |
|
180 | + */ |
|
181 | + public function exists($url = '') { |
|
182 | + $response = $this->request($url); |
|
183 | + if ($response instanceof Response) { |
|
184 | + return $response->getStatusCode() == 200; |
|
185 | + } |
|
186 | + return false; |
|
187 | + } |
|
188 | + |
|
189 | + /** |
|
190 | + * Validate URL |
|
191 | + * |
|
192 | + * @param string $url URL to validate |
|
193 | + * @return string|false The URL if it is valid, false otherwise |
|
194 | + */ |
|
195 | + public function isValidUrl($url = '') { |
|
196 | + // based on http://php.net/manual/en/function.filter-var.php#104160 |
|
197 | + // adapted by @mrclay in https://github.com/mrclay/Elgg-leaf/blob/62bf31c0ccdaab549a7e585a4412443e09821db3/engine/lib/output.php |
|
198 | + $res = filter_var($url, FILTER_VALIDATE_URL); |
|
199 | + if ($res) { |
|
200 | + return $res; |
|
201 | + } |
|
202 | + // Check if it has unicode chars. |
|
203 | + $l = mb_strlen($url); |
|
204 | + if (strlen($url) == $l) { |
|
205 | + return $res; |
|
206 | + } |
|
207 | + // Replace wide chars by “X”. |
|
208 | + $s = ''; |
|
209 | + for ($i = 0; $i < $l; ++$i) { |
|
210 | + $ch = elgg_substr($url, $i, 1); |
|
211 | + $s .= (strlen($ch) > 1) ? 'X' : $ch; |
|
212 | + } |
|
213 | + // Re-check now. |
|
214 | + return filter_var($s, FILTER_VALIDATE_URL) ? $url : false; |
|
215 | + } |
|
216 | + |
|
217 | + /** |
|
218 | + * Sends a GET request for the resource and returns the response, cached per URL |
|
219 | + * |
|
220 | + * @param string $url URL of the resource |
|
221 | + * @return Response|false |
|
222 | + */ |
|
223 | + public function request($url = '') { |
|
224 | + $url = str_replace(' ', '%20', $url); |
|
225 | + if (!$this->isValidUrl($url)) { |
|
226 | + return false; |
|
227 | + } |
|
228 | + if (!isset(self::$cache[$url])) { |
|
229 | + try { |
|
230 | + $response = $this->client->request('GET', $url); |
|
231 | + } catch (Exception $e) { |
|
232 | + $response = false; |
|
233 | + error_log("Parser Error for HEAD request ($url): {$e->getMessage()}"); |
|
234 | + } |
|
235 | + self::$cache[$url] = $response; |
|
236 | + } |
|
237 | + |
|
238 | + return self::$cache[$url]; |
|
239 | + } |
|
240 | + |
|
241 | + /** |
|
242 | + * Get contents of the page |
|
243 | + * |
|
244 | + * @param string $url URL of the resource |
|
245 | + * @return string |
|
246 | + */ |
|
247 | + public function read($url = '') { |
|
248 | + $body = ''; |
|
249 | + if (!$this->exists($url)) { |
|
250 | + return $body; |
|
251 | + } |
|
252 | + |
|
253 | + $response = $this->request($url); |
|
254 | + $body = (string) $response->getBody(); |
|
255 | + return $body; |
|
256 | + } |
|
257 | + |
|
258 | + /** |
|
259 | + * Checks if resource is an html page |
|
260 | + * |
|
261 | + * @param string $url URL of the resource |
|
262 | + * @return boolean |
|
263 | + */ |
|
264 | + public function isHTML($url = '') { |
|
265 | + $mime = $this->getContentType($url); |
|
266 | + return strpos($mime, 'text/html') !== false; |
|
267 | + } |
|
268 | + |
|
269 | + /** |
|
270 | + * Checks if resource is JSON |
|
271 | + * |
|
272 | + * @param string $url URL of the resource |
|
273 | + * @return boolean |
|
274 | + */ |
|
275 | + public function isJSON($url = '') { |
|
276 | + $mime = $this->getContentType($url); |
|
277 | + return strpos($mime, 'json') !== false; |
|
278 | + } |
|
279 | + |
|
280 | + /** |
|
281 | + * Checks if resource is XML |
|
282 | + * |
|
283 | + * @param string $url URL of the resource |
|
284 | + * @return boolean |
|
285 | + */ |
|
286 | + public function isXML($url = '') { |
|
287 | + $mime = $this->getContentType($url); |
|
288 | + return strpos($mime, 'xml') !== false; |
|
289 | + } |
|
290 | + |
|
291 | + /** |
|
292 | + * Checks if resource is an image |
|
293 | + * |
|
294 | + * @param string $url URL of the resource |
|
295 | + * @return boolean |
|
296 | + */ |
|
297 | + public function isImage($url = '') { |
|
298 | + $mime = $this->getContentType($url); |
|
299 | + if ($mime) { |
|
300 | + list($simple, ) = explode('/', $mime); |
|
301 | + return ($simple == 'image'); |
|
302 | + } |
|
303 | + |
|
304 | + return false; |
|
305 | + } |
|
306 | + |
|
307 | + /** |
|
308 | + * Get mime type of the URL content |
|
309 | + * |
|
310 | + * @param string $url URL of the resource |
|
311 | + * @return string |
|
312 | + */ |
|
313 | + public function getContentType($url = '') { |
|
314 | + $response = $this->request($url); |
|
315 | + if ($response instanceof Response) { |
|
316 | + $header = $response->getHeader('Content-Type'); |
|
317 | + if (is_array($header) && !empty($header)) { |
|
318 | + $parts = explode(';', $header[0]); |
|
319 | + return trim($parts[0]); |
|
320 | + } |
|
321 | + } |
|
322 | + return ''; |
|
323 | + } |
|
324 | + |
|
325 | + /** |
|
326 | + * Returns HTML contents of the page |
|
327 | + * |
|
328 | + * @param string $url URL of the resource |
|
329 | + * @return string |
|
330 | + */ |
|
331 | + public function getHTML($url = '') { |
|
332 | + if (!$this->isHTML($url)) { |
|
333 | + return ''; |
|
334 | + } |
|
335 | + return $this->read($url); |
|
336 | + } |
|
337 | + |
|
338 | + /** |
|
339 | + * Returns HTML contents of the page as a DOMDocument |
|
340 | + * |
|
341 | + * @param string $url URL of the resource |
|
342 | + * @return DOMDocument|false |
|
343 | + */ |
|
344 | + public function getDOM($url = '') { |
|
345 | + $html = $this->getHTML($url); |
|
346 | + if (empty($html)) { |
|
347 | + return false; |
|
348 | + } |
|
349 | + $doc = new DOMDocument(); |
|
350 | 350 | |
351 | - libxml_use_internal_errors(true); |
|
351 | + libxml_use_internal_errors(true); |
|
352 | 352 | |
353 | - if (is_callable('mb_convert_encoding')) { |
|
354 | - $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); |
|
355 | - } else { |
|
356 | - $doc->loadHTML($html); |
|
357 | - } |
|
358 | - if (!$doc->documentURI) { |
|
359 | - $doc->documentURI = $url; |
|
360 | - } |
|
353 | + if (is_callable('mb_convert_encoding')) { |
|
354 | + $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); |
|
355 | + } else { |
|
356 | + $doc->loadHTML($html); |
|
357 | + } |
|
358 | + if (!$doc->documentURI) { |
|
359 | + $doc->documentURI = $url; |
|
360 | + } |
|
361 | 361 | |
362 | - libxml_clear_errors(); |
|
362 | + libxml_clear_errors(); |
|
363 | 363 | |
364 | - return $doc; |
|
365 | - } |
|
366 | - |
|
367 | - /** |
|
368 | - * Parses document title |
|
369 | - * |
|
370 | - * @param DOMDocument $doc Document |
|
371 | - * @return string |
|
372 | - */ |
|
373 | - public function parseTitle(DOMDocument $doc) { |
|
374 | - $node = $doc->getElementsByTagName('title'); |
|
375 | - $title = $node->item(0)->nodeValue; |
|
376 | - return ($title) ?: ''; |
|
377 | - } |
|
378 | - |
|
379 | - /** |
|
380 | - * Parses <link> tags |
|
381 | - * |
|
382 | - * @param DOMDocument $doc Document |
|
383 | - * @return array |
|
384 | - */ |
|
385 | - public function parseLinkTags(DOMDocument $doc) { |
|
386 | - |
|
387 | - $meta = array( |
|
388 | - 'icons' => [], |
|
389 | - 'thumbnails' => [], |
|
390 | - ); |
|
391 | - |
|
392 | - $nodes = $doc->getElementsByTagName('link'); |
|
393 | - foreach ($nodes as $node) { |
|
394 | - $rel = $node->getAttribute('rel'); |
|
395 | - $href = $node->getAttribute('href'); |
|
396 | - |
|
397 | - switch ($rel) { |
|
398 | - |
|
399 | - case 'icon' : |
|
400 | - $image_url = $this->getAbsoluteURL($doc, $href); |
|
401 | - if ($this->isImage($image_url)) { |
|
402 | - $meta['icons'][] = $image_url; |
|
403 | - } |
|
404 | - break; |
|
405 | - |
|
406 | - case 'canonical' : |
|
407 | - $meta['canonical'] = $this->getAbsoluteURL($doc, $href); |
|
408 | - break; |
|
409 | - |
|
410 | - case 'alternate' : |
|
411 | - $type = $node->getAttribute('type'); |
|
412 | - if (in_array($type, array( |
|
413 | - 'application/json+oembed', |
|
414 | - 'text/json+oembed', |
|
415 | - 'application/xml+oembed', |
|
416 | - 'text/xml+oembed' |
|
417 | - ))) { |
|
418 | - $meta['oembed_url'][] = $this->getAbsoluteURL($doc, $href); |
|
419 | - } |
|
420 | - break; |
|
421 | - } |
|
422 | - } |
|
423 | - |
|
424 | - return $meta; |
|
425 | - } |
|
426 | - |
|
427 | - /** |
|
428 | - * Parses <meta> tags |
|
429 | - * |
|
430 | - * @param DOMDocument $doc Document |
|
431 | - * @return array |
|
432 | - */ |
|
433 | - public function parseMetaTags(DOMDocument $doc) { |
|
434 | - |
|
435 | - $meta = array(); |
|
436 | - |
|
437 | - $nodes = $doc->getElementsByTagName('meta'); |
|
438 | - if (!empty($nodes)) { |
|
439 | - foreach ($nodes as $node) { |
|
440 | - $name = $node->getAttribute('name'); |
|
441 | - if (!$name) { |
|
442 | - $name = $node->getAttribute('property'); |
|
443 | - } |
|
444 | - if (!$name) { |
|
445 | - continue; |
|
446 | - } |
|
447 | - |
|
448 | - $name = strtolower($name); |
|
364 | + return $doc; |
|
365 | + } |
|
366 | + |
|
367 | + /** |
|
368 | + * Parses document title |
|
369 | + * |
|
370 | + * @param DOMDocument $doc Document |
|
371 | + * @return string |
|
372 | + */ |
|
373 | + public function parseTitle(DOMDocument $doc) { |
|
374 | + $node = $doc->getElementsByTagName('title'); |
|
375 | + $title = $node->item(0)->nodeValue; |
|
376 | + return ($title) ?: ''; |
|
377 | + } |
|
378 | + |
|
379 | + /** |
|
380 | + * Parses <link> tags |
|
381 | + * |
|
382 | + * @param DOMDocument $doc Document |
|
383 | + * @return array |
|
384 | + */ |
|
385 | + public function parseLinkTags(DOMDocument $doc) { |
|
386 | + |
|
387 | + $meta = array( |
|
388 | + 'icons' => [], |
|
389 | + 'thumbnails' => [], |
|
390 | + ); |
|
391 | + |
|
392 | + $nodes = $doc->getElementsByTagName('link'); |
|
393 | + foreach ($nodes as $node) { |
|
394 | + $rel = $node->getAttribute('rel'); |
|
395 | + $href = $node->getAttribute('href'); |
|
396 | + |
|
397 | + switch ($rel) { |
|
398 | + |
|
399 | + case 'icon' : |
|
400 | + $image_url = $this->getAbsoluteURL($doc, $href); |
|
401 | + if ($this->isImage($image_url)) { |
|
402 | + $meta['icons'][] = $image_url; |
|
403 | + } |
|
404 | + break; |
|
405 | + |
|
406 | + case 'canonical' : |
|
407 | + $meta['canonical'] = $this->getAbsoluteURL($doc, $href); |
|
408 | + break; |
|
409 | + |
|
410 | + case 'alternate' : |
|
411 | + $type = $node->getAttribute('type'); |
|
412 | + if (in_array($type, array( |
|
413 | + 'application/json+oembed', |
|
414 | + 'text/json+oembed', |
|
415 | + 'application/xml+oembed', |
|
416 | + 'text/xml+oembed' |
|
417 | + ))) { |
|
418 | + $meta['oembed_url'][] = $this->getAbsoluteURL($doc, $href); |
|
419 | + } |
|
420 | + break; |
|
421 | + } |
|
422 | + } |
|
423 | + |
|
424 | + return $meta; |
|
425 | + } |
|
426 | + |
|
427 | + /** |
|
428 | + * Parses <meta> tags |
|
429 | + * |
|
430 | + * @param DOMDocument $doc Document |
|
431 | + * @return array |
|
432 | + */ |
|
433 | + public function parseMetaTags(DOMDocument $doc) { |
|
434 | + |
|
435 | + $meta = array(); |
|
436 | + |
|
437 | + $nodes = $doc->getElementsByTagName('meta'); |
|
438 | + if (!empty($nodes)) { |
|
439 | + foreach ($nodes as $node) { |
|
440 | + $name = $node->getAttribute('name'); |
|
441 | + if (!$name) { |
|
442 | + $name = $node->getAttribute('property'); |
|
443 | + } |
|
444 | + if (!$name) { |
|
445 | + continue; |
|
446 | + } |
|
447 | + |
|
448 | + $name = strtolower($name); |
|
449 | 449 | |
450 | - if ($name == 'og:image:url') { |
|
451 | - $name = 'og:image'; |
|
452 | - } |
|
453 | - |
|
454 | - $content = $node->getAttribute('content'); |
|
455 | - if (isset($meta['metatags'][$name])) { |
|
456 | - if (!is_array($meta['metatags'][$name])) { |
|
457 | - $meta['metatags'][$name] = array($meta['metatags'][$name]); |
|
458 | - } |
|
459 | - $meta['metatags'][$name][] = $content; |
|
460 | - } else { |
|
461 | - $meta['metatags'][$name] = $content; |
|
462 | - } |
|
463 | - |
|
464 | - switch ($name) { |
|
465 | - |
|
466 | - case 'title' : |
|
467 | - case 'og:title' : |
|
468 | - case 'twitter:title' : |
|
469 | - if (empty($meta['title'])) { |
|
470 | - $meta['title'] = $content; |
|
471 | - } |
|
472 | - break; |
|
473 | - |
|
474 | - case 'og:type' : |
|
475 | - if (empty($meta['type'])) { |
|
476 | - $meta['type'] = $content; |
|
477 | - } |
|
478 | - break; |
|
479 | - |
|
480 | - case 'description' : |
|
481 | - case 'og:description' : |
|
482 | - case 'twitter:description' : |
|
483 | - if (empty($meta['description'])) { |
|
484 | - $meta['description'] = $content; |
|
485 | - } |
|
486 | - break; |
|
487 | - |
|
488 | - case 'keywords' : |
|
489 | - if (is_string($content)) { |
|
490 | - $content = explode(',', $content); |
|
491 | - $content = array_map('trim', $content); |
|
492 | - } |
|
493 | - $meta['tags'] = $content; |
|
494 | - break; |
|
495 | - |
|
496 | - case 'og:site_name' : |
|
497 | - case 'twitter:site' : |
|
498 | - if (empty($meta['provider_name'])) { |
|
499 | - $meta['provider_name'] = $content; |
|
500 | - } |
|
501 | - break; |
|
502 | - |
|
503 | - case 'og:image' : |
|
504 | - case 'twitter:image' : |
|
505 | - $image_url = $this->getAbsoluteURL($doc, $content); |
|
506 | - if ($this->isImage($image_url)) { |
|
507 | - $meta['thumbnails'][] = $image_url; |
|
508 | - } |
|
509 | - break; |
|
510 | - } |
|
511 | - } |
|
512 | - } |
|
513 | - |
|
514 | - return $meta; |
|
515 | - } |
|
516 | - |
|
517 | - /** |
|
518 | - * Parses <img> tags |
|
519 | - * |
|
520 | - * @param DOMDocument $doc Document |
|
521 | - * @return array |
|
522 | - */ |
|
523 | - public function parseImgTags(DOMDocument $doc) { |
|
524 | - |
|
525 | - $meta = array( |
|
526 | - 'thumbnails' => [], |
|
527 | - ); |
|
528 | - |
|
529 | - $nodes = $doc->getElementsByTagName('img'); |
|
530 | - foreach ($nodes as $node) { |
|
531 | - $src = $node->getAttribute('src'); |
|
532 | - $image_url = $this->getAbsoluteURL($doc, $src); |
|
533 | - if ($this->isImage($image_url)) { |
|
534 | - $meta['thumbnails'][] = $image_url; |
|
535 | - } |
|
536 | - } |
|
537 | - |
|
538 | - return $meta; |
|
539 | - } |
|
540 | - |
|
541 | - /** |
|
542 | - * Normalizes relative URLs |
|
543 | - * |
|
544 | - * @param DOMDocument $doc Document |
|
545 | - * @param string $href URL to normalize |
|
546 | - * @return string|false |
|
547 | - */ |
|
548 | - public function getAbsoluteURL(DOMDocument $doc, $href = '') { |
|
549 | - |
|
550 | - if (preg_match("/^data:/i", $href)) { |
|
551 | - // data URIs can not be resolved |
|
552 | - return false; |
|
553 | - } |
|
554 | - |
|
555 | - // Check if $url is absolute |
|
556 | - if (parse_url($href, PHP_URL_HOST)) { |
|
557 | - return $href; |
|
558 | - } |
|
559 | - |
|
560 | - $uri = trim($doc->documentURI ?: '', '/'); |
|
561 | - |
|
562 | - $scheme = parse_url($uri, PHP_URL_SCHEME); |
|
563 | - $host = parse_url($uri, PHP_URL_HOST); |
|
564 | - |
|
565 | - if (substr($href, 0, 1) === "/") { |
|
566 | - // URL is relative to site root |
|
567 | - return "$scheme://$host$href"; |
|
568 | - } |
|
569 | - |
|
570 | - // URL is relative to page |
|
571 | - $path = parse_url($uri, PHP_URL_PATH); |
|
572 | - |
|
573 | - return "$scheme://$host$path/$href"; |
|
574 | - } |
|
450 | + if ($name == 'og:image:url') { |
|
451 | + $name = 'og:image'; |
|
452 | + } |
|
453 | + |
|
454 | + $content = $node->getAttribute('content'); |
|
455 | + if (isset($meta['metatags'][$name])) { |
|
456 | + if (!is_array($meta['metatags'][$name])) { |
|
457 | + $meta['metatags'][$name] = array($meta['metatags'][$name]); |
|
458 | + } |
|
459 | + $meta['metatags'][$name][] = $content; |
|
460 | + } else { |
|
461 | + $meta['metatags'][$name] = $content; |
|
462 | + } |
|
463 | + |
|
464 | + switch ($name) { |
|
465 | + |
|
466 | + case 'title' : |
|
467 | + case 'og:title' : |
|
468 | + case 'twitter:title' : |
|
469 | + if (empty($meta['title'])) { |
|
470 | + $meta['title'] = $content; |
|
471 | + } |
|
472 | + break; |
|
473 | + |
|
474 | + case 'og:type' : |
|
475 | + if (empty($meta['type'])) { |
|
476 | + $meta['type'] = $content; |
|
477 | + } |
|
478 | + break; |
|
479 | + |
|
480 | + case 'description' : |
|
481 | + case 'og:description' : |
|
482 | + case 'twitter:description' : |
|
483 | + if (empty($meta['description'])) { |
|
484 | + $meta['description'] = $content; |
|
485 | + } |
|
486 | + break; |
|
487 | + |
|
488 | + case 'keywords' : |
|
489 | + if (is_string($content)) { |
|
490 | + $content = explode(',', $content); |
|
491 | + $content = array_map('trim', $content); |
|
492 | + } |
|
493 | + $meta['tags'] = $content; |
|
494 | + break; |
|
495 | + |
|
496 | + case 'og:site_name' : |
|
497 | + case 'twitter:site' : |
|
498 | + if (empty($meta['provider_name'])) { |
|
499 | + $meta['provider_name'] = $content; |
|
500 | + } |
|
501 | + break; |
|
502 | + |
|
503 | + case 'og:image' : |
|
504 | + case 'twitter:image' : |
|
505 | + $image_url = $this->getAbsoluteURL($doc, $content); |
|
506 | + if ($this->isImage($image_url)) { |
|
507 | + $meta['thumbnails'][] = $image_url; |
|
508 | + } |
|
509 | + break; |
|
510 | + } |
|
511 | + } |
|
512 | + } |
|
513 | + |
|
514 | + return $meta; |
|
515 | + } |
|
516 | + |
|
517 | + /** |
|
518 | + * Parses <img> tags |
|
519 | + * |
|
520 | + * @param DOMDocument $doc Document |
|
521 | + * @return array |
|
522 | + */ |
|
523 | + public function parseImgTags(DOMDocument $doc) { |
|
524 | + |
|
525 | + $meta = array( |
|
526 | + 'thumbnails' => [], |
|
527 | + ); |
|
528 | + |
|
529 | + $nodes = $doc->getElementsByTagName('img'); |
|
530 | + foreach ($nodes as $node) { |
|
531 | + $src = $node->getAttribute('src'); |
|
532 | + $image_url = $this->getAbsoluteURL($doc, $src); |
|
533 | + if ($this->isImage($image_url)) { |
|
534 | + $meta['thumbnails'][] = $image_url; |
|
535 | + } |
|
536 | + } |
|
537 | + |
|
538 | + return $meta; |
|
539 | + } |
|
540 | + |
|
541 | + /** |
|
542 | + * Normalizes relative URLs |
|
543 | + * |
|
544 | + * @param DOMDocument $doc Document |
|
545 | + * @param string $href URL to normalize |
|
546 | + * @return string|false |
|
547 | + */ |
|
548 | + public function getAbsoluteURL(DOMDocument $doc, $href = '') { |
|
549 | + |
|
550 | + if (preg_match("/^data:/i", $href)) { |
|
551 | + // data URIs can not be resolved |
|
552 | + return false; |
|
553 | + } |
|
554 | + |
|
555 | + // Check if $href is absolute |
|
556 | + if (parse_url($href, PHP_URL_HOST)) { |
|
557 | + return $href; |
|
558 | + } |
|
559 | + |
|
560 | + $uri = trim($doc->documentURI ?: '', '/'); |
|
561 | + |
|
562 | + $scheme = parse_url($uri, PHP_URL_SCHEME); |
|
563 | + $host = parse_url($uri, PHP_URL_HOST); |
|
564 | + |
|
565 | + if (substr($href, 0, 1) === "/") { |
|
566 | + // URL is relative to site root |
|
567 | + return "$scheme://$host$href"; |
|
568 | + } |
|
569 | + |
|
570 | + // URL is relative to page |
|
571 | + $path = parse_url($uri, PHP_URL_PATH); |
|
572 | + |
|
573 | + return "$scheme://$host$path/$href"; |
|
574 | + } |
|
575 | 575 | |
576 | 576 | } |