@@ -12,554 +12,554 @@ |
||
| 12 | 12 | */ |
| 13 | 13 | class Parser { |
| 14 | 14 | |
| 15 | - /** |
|
| 16 | - * @var ClientInterface |
|
| 17 | - */ |
|
| 18 | - private $client; |
|
| 19 | - |
|
| 20 | - /** |
|
| 21 | - * @var array |
|
| 22 | - */ |
|
| 23 | - private static $cache; |
|
| 24 | - |
|
| 25 | - /** |
|
| 26 | - * Constructor |
|
| 27 | - * @param ClientInterface $client HTTP Client |
|
| 28 | - */ |
|
| 29 | - public function __construct(ClientInterface $client) { |
|
| 30 | - $this->client = $client; |
|
| 31 | - } |
|
| 32 | - |
|
| 33 | - /** |
|
| 34 | - * Parses a URL into a an array of metatags |
|
| 35 | - * |
|
| 36 | - * @param string $url URL to parse |
|
| 37 | - * @return array |
|
| 38 | - */ |
|
| 39 | - public function parse($url = '') { |
|
| 40 | - |
|
| 41 | - $data = $this->getImageData($url); |
|
| 42 | - if (!$data) { |
|
| 43 | - $data = $this->getOEmbedData($url); |
|
| 44 | - } |
|
| 45 | - if (!$data) { |
|
| 46 | - $data = $this->getDOMData($url); |
|
| 47 | - if (is_array($data) && !empty($data['oembed_url'])) { |
|
| 48 | - foreach ($data['oembed_url'] as $oembed_url) { |
|
| 49 | - $oembed_data = $this->parse($oembed_url); |
|
| 50 | - if (!empty($oembed_data) && is_array($oembed_data)) { |
|
| 51 | - $oembed_data['oembed_url'] = $oembed_data['url']; |
|
| 52 | - unset($oembed_data['url']); |
|
| 53 | - $data = array_merge($data, $oembed_data); |
|
| 54 | - } |
|
| 55 | - } |
|
| 56 | - } |
|
| 57 | - } |
|
| 58 | - |
|
| 59 | - if (!is_array($data)) { |
|
| 60 | - $data = array(); |
|
| 61 | - } |
|
| 62 | - |
|
| 63 | - if (empty($data['thumbnail_url']) && !empty($data['thumbnails'])) { |
|
| 64 | - $data['thumbnail_url'] = $data['thumbnails'][0]; |
|
| 65 | - } |
|
| 66 | - |
|
| 67 | - return $data; |
|
| 68 | - } |
|
| 69 | - |
|
| 70 | - /** |
|
| 71 | - * Parses image metatags |
|
| 72 | - * |
|
| 73 | - * @param string $url URL of the image |
|
| 74 | - * @return array|false |
|
| 75 | - */ |
|
| 76 | - public function getImageData($url = '') { |
|
| 77 | - if (!$this->isImage($url)) { |
|
| 78 | - return false; |
|
| 79 | - } |
|
| 80 | - |
|
| 81 | - return array( |
|
| 82 | - 'type' => 'photo', |
|
| 83 | - 'url' => $url, |
|
| 84 | - 'thumbnails' => array($url), |
|
| 85 | - ); |
|
| 86 | - } |
|
| 87 | - |
|
| 88 | - /** |
|
| 89 | - * Parses OEmbed data |
|
| 90 | - * |
|
| 91 | - * @param string $url URL of the image |
|
| 92 | - * @return array|false |
|
| 93 | - */ |
|
| 94 | - public function getOEmbedData($url = '') { |
|
| 95 | - |
|
| 96 | - if (!$this->isJSON($url) && !$this->isXML($url)) { |
|
| 97 | - return false; |
|
| 98 | - } |
|
| 99 | - |
|
| 100 | - $meta = array( |
|
| 101 | - 'url' => $url, |
|
| 102 | - ); |
|
| 103 | - |
|
| 104 | - $content = $this->read($url); |
|
| 105 | - if (!$content) { |
|
| 106 | - return $meta; |
|
| 107 | - } |
|
| 108 | - |
|
| 109 | - $data = new \stdClass(); |
|
| 110 | - if ($this->isJSON($url)) { |
|
| 111 | - $data = json_decode($content); |
|
| 112 | - } else if ($this->isXML($url)) { |
|
| 113 | - $data = simplexml_load_string($content); |
|
| 114 | - } |
|
| 115 | - |
|
| 116 | - $props = array( |
|
| 117 | - 'type', |
|
| 118 | - 'version', |
|
| 119 | - 'title', |
|
| 120 | - 'author_name', |
|
| 121 | - 'author_url', |
|
| 122 | - 'provider_name', |
|
| 123 | - 'provider_url', |
|
| 124 | - 'cache_age', |
|
| 125 | - 'thumbnail_url', |
|
| 126 | - 'thumbnail_width', |
|
| 127 | - 'thumbnail_height', |
|
| 128 | - 'width', |
|
| 129 | - 'height', |
|
| 130 | - 'html', |
|
| 131 | - ); |
|
| 132 | - foreach ($props as $key) { |
|
| 133 | - if (!empty($data->$key)) { |
|
| 134 | - $meta[$key] = (string) $data->$key; |
|
| 135 | - } |
|
| 136 | - } |
|
| 137 | - return $meta; |
|
| 138 | - } |
|
| 139 | - |
|
| 140 | - /** |
|
| 141 | - * Parses metatags from DOM |
|
| 142 | - * |
|
| 143 | - * @param string $url URL |
|
| 144 | - * @return array|false |
|
| 145 | - */ |
|
| 146 | - public function getDOMData($url = '') { |
|
| 147 | - |
|
| 148 | - if (!$this->isHTML($url)) { |
|
| 149 | - return false; |
|
| 150 | - } |
|
| 151 | - |
|
| 152 | - $doc = $this->getDOM($url); |
|
| 153 | - if (!$doc) { |
|
| 154 | - return false; |
|
| 155 | - } |
|
| 156 | - |
|
| 157 | - $defaults = array( |
|
| 158 | - 'url' => $url, |
|
| 159 | - ); |
|
| 160 | - |
|
| 161 | - $link_tags = $this->parseLinkTags($doc); |
|
| 162 | - $meta_tags = $this->parseMetaTags($doc); |
|
| 163 | - $img_tags = $this->parseImgTags($doc); |
|
| 164 | - |
|
| 165 | - $meta = array_merge_recursive($defaults, $link_tags, $meta_tags, $img_tags); |
|
| 166 | - |
|
| 167 | - if (empty($meta['title'])) { |
|
| 168 | - $meta['title'] = $this->parseTitle($doc); |
|
| 169 | - } |
|
| 170 | - |
|
| 171 | - |
|
| 172 | - return $meta; |
|
| 173 | - } |
|
| 174 | - |
|
| 175 | - /** |
|
| 176 | - * Check if URL exists and is reachable by making an HTTP request to retrieve header information |
|
| 177 | - * |
|
| 178 | - * @param string $url URL of the resource |
|
| 179 | - * @return boolean |
|
| 180 | - */ |
|
| 181 | - public function exists($url = '') { |
|
| 182 | - $response = $this->request($url); |
|
| 183 | - if ($response instanceof Response) { |
|
| 184 | - return $response->getStatusCode() == 200; |
|
| 185 | - } |
|
| 186 | - return false; |
|
| 187 | - } |
|
| 188 | - |
|
| 189 | - /** |
|
| 190 | - * Validate URL |
|
| 191 | - * |
|
| 192 | - * @param string $url URL to validate |
|
| 193 | - * @return bool |
|
| 194 | - */ |
|
| 195 | - public function isValidUrl($url = '') { |
|
| 196 | - // based on http://php.net/manual/en/function.filter-var.php#104160 |
|
| 197 | - // adapted by @mrclay in https://github.com/mrclay/Elgg-leaf/blob/62bf31c0ccdaab549a7e585a4412443e09821db3/engine/lib/output.php |
|
| 198 | - $res = filter_var($url, FILTER_VALIDATE_URL); |
|
| 199 | - if ($res) { |
|
| 200 | - return $res; |
|
| 201 | - } |
|
| 202 | - // Check if it has unicode chars. |
|
| 203 | - $l = elgg_strlen($url); |
|
| 204 | - if (strlen($url) == $l) { |
|
| 205 | - return $res; |
|
| 206 | - } |
|
| 207 | - // Replace wide chars by “X”. |
|
| 208 | - $s = ''; |
|
| 209 | - for ($i = 0; $i < $l; ++$i) { |
|
| 210 | - $ch = elgg_substr($url, $i, 1); |
|
| 211 | - $s .= (strlen($ch) > 1) ? 'X' : $ch; |
|
| 212 | - } |
|
| 213 | - // Re-check now. |
|
| 214 | - return filter_var($s, FILTER_VALIDATE_URL) ? $url : false; |
|
| 215 | - } |
|
| 216 | - |
|
| 217 | - /** |
|
| 218 | - * Returns head of the resource |
|
| 219 | - * |
|
| 220 | - * @param string $url URL of the resource |
|
| 221 | - * @return Response|false |
|
| 222 | - */ |
|
| 223 | - public function request($url = '') { |
|
| 224 | - $url = str_replace(' ', '%20', $url); |
|
| 225 | - if (!$this->isValidUrl($url)) { |
|
| 226 | - return false; |
|
| 227 | - } |
|
| 228 | - if (!isset(self::$cache[$url])) { |
|
| 229 | - try { |
|
| 230 | - $response = $this->client->request('GET', $url); |
|
| 231 | - } catch (Exception $e) { |
|
| 232 | - $response = false; |
|
| 233 | - error_log("Parser Error for HEAD request ($url): {$e->getMessage()}"); |
|
| 234 | - } |
|
| 235 | - self::$cache[$url] = $response; |
|
| 236 | - } |
|
| 237 | - |
|
| 238 | - return self::$cache[$url]; |
|
| 239 | - } |
|
| 240 | - |
|
| 241 | - /** |
|
| 242 | - * Get contents of the page |
|
| 243 | - * |
|
| 244 | - * @param string $url URL of the resource |
|
| 245 | - * @return string |
|
| 246 | - */ |
|
| 247 | - public function read($url = '') { |
|
| 248 | - $body = ''; |
|
| 249 | - if (!$this->exists($url)) { |
|
| 250 | - return $body; |
|
| 251 | - } |
|
| 252 | - |
|
| 253 | - $response = $this->request($url); |
|
| 254 | - $body = (string) $response->getBody(); |
|
| 255 | - return $body; |
|
| 256 | - } |
|
| 257 | - |
|
| 258 | - /** |
|
| 259 | - * Checks if resource is an html page |
|
| 260 | - * |
|
| 261 | - * @param string $url URL of the resource |
|
| 262 | - * @return boolean |
|
| 263 | - */ |
|
| 264 | - public function isHTML($url = '') { |
|
| 265 | - $mime = $this->getContentType($url); |
|
| 266 | - return strpos($mime, 'text/html') !== false; |
|
| 267 | - } |
|
| 268 | - |
|
| 269 | - /** |
|
| 270 | - * Checks if resource is JSON |
|
| 271 | - * |
|
| 272 | - * @param string $url URL of the resource |
|
| 273 | - * @return boolean |
|
| 274 | - */ |
|
| 275 | - public function isJSON($url = '') { |
|
| 276 | - $mime = $this->getContentType($url); |
|
| 277 | - return strpos($mime, 'json') !== false; |
|
| 278 | - } |
|
| 279 | - |
|
| 280 | - /** |
|
| 281 | - * Checks if resource is XML |
|
| 282 | - * |
|
| 283 | - * @param string $url URL of the resource |
|
| 284 | - * @return boolean |
|
| 285 | - */ |
|
| 286 | - public function isXML($url = '') { |
|
| 287 | - $mime = $this->getContentType($url); |
|
| 288 | - return strpos($mime, 'xml') !== false; |
|
| 289 | - } |
|
| 290 | - |
|
| 291 | - /** |
|
| 292 | - * Checks if resource is an image |
|
| 293 | - * |
|
| 294 | - * @param string $url URL of the resource |
|
| 295 | - * @return boolean |
|
| 296 | - */ |
|
| 297 | - public function isImage($url = '') { |
|
| 298 | - $mime = $this->getContentType($url); |
|
| 299 | - if ($mime) { |
|
| 300 | - list($simple, ) = explode('/', $mime); |
|
| 301 | - return ($simple == 'image'); |
|
| 302 | - } |
|
| 303 | - |
|
| 304 | - return false; |
|
| 305 | - } |
|
| 306 | - |
|
| 307 | - /** |
|
| 308 | - * Get mime type of the URL content |
|
| 309 | - * |
|
| 310 | - * @param string $url URL of the resource |
|
| 311 | - * @return string |
|
| 312 | - */ |
|
| 313 | - public function getContentType($url = '') { |
|
| 314 | - $response = $this->request($url); |
|
| 315 | - if ($response instanceof Response) { |
|
| 316 | - $header = $response->getHeader('Content-Type'); |
|
| 317 | - if (is_array($header) && !empty($header)) { |
|
| 318 | - $parts = explode(';', $header[0]); |
|
| 319 | - return trim($parts[0]); |
|
| 320 | - } |
|
| 321 | - } |
|
| 322 | - return ''; |
|
| 323 | - } |
|
| 324 | - |
|
| 325 | - /** |
|
| 326 | - * Returns HTML contents of the page |
|
| 327 | - * |
|
| 328 | - * @param string $url URL of the resource |
|
| 329 | - * @return string |
|
| 330 | - */ |
|
| 331 | - public function getHTML($url = '') { |
|
| 332 | - if (!$this->isHTML($url)) { |
|
| 333 | - return ''; |
|
| 334 | - } |
|
| 335 | - return $this->read($url); |
|
| 336 | - } |
|
| 337 | - |
|
| 338 | - /** |
|
| 339 | - * Returns HTML contents of the page as a DOMDocument |
|
| 340 | - * |
|
| 341 | - * @param string $url URL of the resource |
|
| 342 | - * @return DOMDocument|false |
|
| 343 | - */ |
|
| 344 | - public function getDOM($url = '') { |
|
| 345 | - $html = $this->getHTML($url); |
|
| 346 | - if (empty($html)) { |
|
| 347 | - return false; |
|
| 348 | - } |
|
| 349 | - $doc = new DOMDocument(); |
|
| 15 | + /** |
|
| 16 | + * @var ClientInterface |
|
| 17 | + */ |
|
| 18 | + private $client; |
|
| 19 | + |
|
| 20 | + /** |
|
| 21 | + * @var array |
|
| 22 | + */ |
|
| 23 | + private static $cache; |
|
| 24 | + |
|
| 25 | + /** |
|
| 26 | + * Constructor |
|
| 27 | + * @param ClientInterface $client HTTP Client |
|
| 28 | + */ |
|
| 29 | + public function __construct(ClientInterface $client) { |
|
| 30 | + $this->client = $client; |
|
| 31 | + } |
|
| 32 | + |
|
| 33 | + /** |
|
| 34 | + * Parses a URL into an array of metatags |
|
| 35 | + * |
|
| 36 | + * @param string $url URL to parse |
|
| 37 | + * @return array |
|
| 38 | + */ |
|
| 39 | + public function parse($url = '') { |
|
| 40 | + |
|
| 41 | + $data = $this->getImageData($url); |
|
| 42 | + if (!$data) { |
|
| 43 | + $data = $this->getOEmbedData($url); |
|
| 44 | + } |
|
| 45 | + if (!$data) { |
|
| 46 | + $data = $this->getDOMData($url); |
|
| 47 | + if (is_array($data) && !empty($data['oembed_url'])) { |
|
| 48 | + foreach ($data['oembed_url'] as $oembed_url) { |
|
| 49 | + $oembed_data = $this->parse($oembed_url); |
|
| 50 | + if (!empty($oembed_data) && is_array($oembed_data)) { |
|
| 51 | + $oembed_data['oembed_url'] = $oembed_data['url']; |
|
| 52 | + unset($oembed_data['url']); |
|
| 53 | + $data = array_merge($data, $oembed_data); |
|
| 54 | + } |
|
| 55 | + } |
|
| 56 | + } |
|
| 57 | + } |
|
| 58 | + |
|
| 59 | + if (!is_array($data)) { |
|
| 60 | + $data = array(); |
|
| 61 | + } |
|
| 62 | + |
|
| 63 | + if (empty($data['thumbnail_url']) && !empty($data['thumbnails'])) { |
|
| 64 | + $data['thumbnail_url'] = $data['thumbnails'][0]; |
|
| 65 | + } |
|
| 66 | + |
|
| 67 | + return $data; |
|
| 68 | + } |
|
| 69 | + |
|
| 70 | + /** |
|
| 71 | + * Parses image metatags |
|
| 72 | + * |
|
| 73 | + * @param string $url URL of the image |
|
| 74 | + * @return array|false |
|
| 75 | + */ |
|
| 76 | + public function getImageData($url = '') { |
|
| 77 | + if (!$this->isImage($url)) { |
|
| 78 | + return false; |
|
| 79 | + } |
|
| 80 | + |
|
| 81 | + return array( |
|
| 82 | + 'type' => 'photo', |
|
| 83 | + 'url' => $url, |
|
| 84 | + 'thumbnails' => array($url), |
|
| 85 | + ); |
|
| 86 | + } |
|
| 87 | + |
|
| 88 | + /** |
|
| 89 | + * Parses OEmbed data |
|
| 90 | + * |
|
| 91 | + * @param string $url URL of the oEmbed resource (JSON or XML) |
|
| 92 | + * @return array|false |
|
| 93 | + */ |
|
| 94 | + public function getOEmbedData($url = '') { |
|
| 95 | + |
|
| 96 | + if (!$this->isJSON($url) && !$this->isXML($url)) { |
|
| 97 | + return false; |
|
| 98 | + } |
|
| 99 | + |
|
| 100 | + $meta = array( |
|
| 101 | + 'url' => $url, |
|
| 102 | + ); |
|
| 103 | + |
|
| 104 | + $content = $this->read($url); |
|
| 105 | + if (!$content) { |
|
| 106 | + return $meta; |
|
| 107 | + } |
|
| 108 | + |
|
| 109 | + $data = new \stdClass(); |
|
| 110 | + if ($this->isJSON($url)) { |
|
| 111 | + $data = json_decode($content); |
|
| 112 | + } else if ($this->isXML($url)) { |
|
| 113 | + $data = simplexml_load_string($content); |
|
| 114 | + } |
|
| 115 | + |
|
| 116 | + $props = array( |
|
| 117 | + 'type', |
|
| 118 | + 'version', |
|
| 119 | + 'title', |
|
| 120 | + 'author_name', |
|
| 121 | + 'author_url', |
|
| 122 | + 'provider_name', |
|
| 123 | + 'provider_url', |
|
| 124 | + 'cache_age', |
|
| 125 | + 'thumbnail_url', |
|
| 126 | + 'thumbnail_width', |
|
| 127 | + 'thumbnail_height', |
|
| 128 | + 'width', |
|
| 129 | + 'height', |
|
| 130 | + 'html', |
|
| 131 | + ); |
|
| 132 | + foreach ($props as $key) { |
|
| 133 | + if (!empty($data->$key)) { |
|
| 134 | + $meta[$key] = (string) $data->$key; |
|
| 135 | + } |
|
| 136 | + } |
|
| 137 | + return $meta; |
|
| 138 | + } |
|
| 139 | + |
|
| 140 | + /** |
|
| 141 | + * Parses metatags from DOM |
|
| 142 | + * |
|
| 143 | + * @param string $url URL |
|
| 144 | + * @return array|false |
|
| 145 | + */ |
|
| 146 | + public function getDOMData($url = '') { |
|
| 147 | + |
|
| 148 | + if (!$this->isHTML($url)) { |
|
| 149 | + return false; |
|
| 150 | + } |
|
| 151 | + |
|
| 152 | + $doc = $this->getDOM($url); |
|
| 153 | + if (!$doc) { |
|
| 154 | + return false; |
|
| 155 | + } |
|
| 156 | + |
|
| 157 | + $defaults = array( |
|
| 158 | + 'url' => $url, |
|
| 159 | + ); |
|
| 160 | + |
|
| 161 | + $link_tags = $this->parseLinkTags($doc); |
|
| 162 | + $meta_tags = $this->parseMetaTags($doc); |
|
| 163 | + $img_tags = $this->parseImgTags($doc); |
|
| 164 | + |
|
| 165 | + $meta = array_merge_recursive($defaults, $link_tags, $meta_tags, $img_tags); |
|
| 166 | + |
|
| 167 | + if (empty($meta['title'])) { |
|
| 168 | + $meta['title'] = $this->parseTitle($doc); |
|
| 169 | + } |
|
| 170 | + |
|
| 171 | + |
|
| 172 | + return $meta; |
|
| 173 | + } |
|
| 174 | + |
|
| 175 | + /** |
|
| 176 | + * Checks whether the URL exists and is reachable by making an HTTP request and verifying that the response status is 200 |
|
| 177 | + * |
|
| 178 | + * @param string $url URL of the resource |
|
| 179 | + * @return boolean |
|
| 180 | + */ |
|
| 181 | + public function exists($url = '') { |
|
| 182 | + $response = $this->request($url); |
|
| 183 | + if ($response instanceof Response) { |
|
| 184 | + return $response->getStatusCode() == 200; |
|
| 185 | + } |
|
| 186 | + return false; |
|
| 187 | + } |
|
| 188 | + |
|
| 189 | + /** |
|
| 190 | + * Validate URL |
|
| 191 | + * |
|
| 192 | + * @param string $url URL to validate |
|
| 193 | + * @return string|false The URL if it is valid, false otherwise |
|
| 194 | + */ |
|
| 195 | + public function isValidUrl($url = '') { |
|
| 196 | + // based on http://php.net/manual/en/function.filter-var.php#104160 |
|
| 197 | + // adapted by @mrclay in https://github.com/mrclay/Elgg-leaf/blob/62bf31c0ccdaab549a7e585a4412443e09821db3/engine/lib/output.php |
|
| 198 | + $res = filter_var($url, FILTER_VALIDATE_URL); |
|
| 199 | + if ($res) { |
|
| 200 | + return $res; |
|
| 201 | + } |
|
| 202 | + // Check if it has unicode chars. |
|
| 203 | + $l = elgg_strlen($url); |
|
| 204 | + if (strlen($url) == $l) { |
|
| 205 | + return $res; |
|
| 206 | + } |
|
| 207 | + // Replace wide chars by “X”. |
|
| 208 | + $s = ''; |
|
| 209 | + for ($i = 0; $i < $l; ++$i) { |
|
| 210 | + $ch = elgg_substr($url, $i, 1); |
|
| 211 | + $s .= (strlen($ch) > 1) ? 'X' : $ch; |
|
| 212 | + } |
|
| 213 | + // Re-check now. |
|
| 214 | + return filter_var($s, FILTER_VALIDATE_URL) ? $url : false; |
|
| 215 | + } |
|
| 216 | + |
|
| 217 | + /** |
|
| 218 | + * Sends a GET request to the resource and returns the response (cached per URL) |
|
| 219 | + * |
|
| 220 | + * @param string $url URL of the resource |
|
| 221 | + * @return Response|false |
|
| 222 | + */ |
|
| 223 | + public function request($url = '') { |
|
| 224 | + $url = str_replace(' ', '%20', $url); |
|
| 225 | + if (!$this->isValidUrl($url)) { |
|
| 226 | + return false; |
|
| 227 | + } |
|
| 228 | + if (!isset(self::$cache[$url])) { |
|
| 229 | + try { |
|
| 230 | + $response = $this->client->request('GET', $url); |
|
| 231 | + } catch (Exception $e) { |
|
| 232 | + $response = false; |
|
| 233 | + error_log("Parser Error for HEAD request ($url): {$e->getMessage()}"); |
|
| 234 | + } |
|
| 235 | + self::$cache[$url] = $response; |
|
| 236 | + } |
|
| 237 | + |
|
| 238 | + return self::$cache[$url]; |
|
| 239 | + } |
|
| 240 | + |
|
| 241 | + /** |
|
| 242 | + * Get contents of the page |
|
| 243 | + * |
|
| 244 | + * @param string $url URL of the resource |
|
| 245 | + * @return string |
|
| 246 | + */ |
|
| 247 | + public function read($url = '') { |
|
| 248 | + $body = ''; |
|
| 249 | + if (!$this->exists($url)) { |
|
| 250 | + return $body; |
|
| 251 | + } |
|
| 252 | + |
|
| 253 | + $response = $this->request($url); |
|
| 254 | + $body = (string) $response->getBody(); |
|
| 255 | + return $body; |
|
| 256 | + } |
|
| 257 | + |
|
| 258 | + /** |
|
| 259 | + * Checks if resource is an html page |
|
| 260 | + * |
|
| 261 | + * @param string $url URL of the resource |
|
| 262 | + * @return boolean |
|
| 263 | + */ |
|
| 264 | + public function isHTML($url = '') { |
|
| 265 | + $mime = $this->getContentType($url); |
|
| 266 | + return strpos($mime, 'text/html') !== false; |
|
| 267 | + } |
|
| 268 | + |
|
| 269 | + /** |
|
| 270 | + * Checks if resource is JSON |
|
| 271 | + * |
|
| 272 | + * @param string $url URL of the resource |
|
| 273 | + * @return boolean |
|
| 274 | + */ |
|
| 275 | + public function isJSON($url = '') { |
|
| 276 | + $mime = $this->getContentType($url); |
|
| 277 | + return strpos($mime, 'json') !== false; |
|
| 278 | + } |
|
| 279 | + |
|
| 280 | + /** |
|
| 281 | + * Checks if resource is XML |
|
| 282 | + * |
|
| 283 | + * @param string $url URL of the resource |
|
| 284 | + * @return boolean |
|
| 285 | + */ |
|
| 286 | + public function isXML($url = '') { |
|
| 287 | + $mime = $this->getContentType($url); |
|
| 288 | + return strpos($mime, 'xml') !== false; |
|
| 289 | + } |
|
| 290 | + |
|
| 291 | + /** |
|
| 292 | + * Checks if resource is an image |
|
| 293 | + * |
|
| 294 | + * @param string $url URL of the resource |
|
| 295 | + * @return boolean |
|
| 296 | + */ |
|
| 297 | + public function isImage($url = '') { |
|
| 298 | + $mime = $this->getContentType($url); |
|
| 299 | + if ($mime) { |
|
| 300 | + list($simple, ) = explode('/', $mime); |
|
| 301 | + return ($simple == 'image'); |
|
| 302 | + } |
|
| 303 | + |
|
| 304 | + return false; |
|
| 305 | + } |
|
| 306 | + |
|
| 307 | + /** |
|
| 308 | + * Get mime type of the URL content |
|
| 309 | + * |
|
| 310 | + * @param string $url URL of the resource |
|
| 311 | + * @return string |
|
| 312 | + */ |
|
| 313 | + public function getContentType($url = '') { |
|
| 314 | + $response = $this->request($url); |
|
| 315 | + if ($response instanceof Response) { |
|
| 316 | + $header = $response->getHeader('Content-Type'); |
|
| 317 | + if (is_array($header) && !empty($header)) { |
|
| 318 | + $parts = explode(';', $header[0]); |
|
| 319 | + return trim($parts[0]); |
|
| 320 | + } |
|
| 321 | + } |
|
| 322 | + return ''; |
|
| 323 | + } |
|
| 324 | + |
|
| 325 | + /** |
|
| 326 | + * Returns HTML contents of the page |
|
| 327 | + * |
|
| 328 | + * @param string $url URL of the resource |
|
| 329 | + * @return string |
|
| 330 | + */ |
|
| 331 | + public function getHTML($url = '') { |
|
| 332 | + if (!$this->isHTML($url)) { |
|
| 333 | + return ''; |
|
| 334 | + } |
|
| 335 | + return $this->read($url); |
|
| 336 | + } |
|
| 337 | + |
|
| 338 | + /** |
|
| 339 | + * Returns HTML contents of the page as a DOMDocument |
|
| 340 | + * |
|
| 341 | + * @param string $url URL of the resource |
|
| 342 | + * @return DOMDocument|false |
|
| 343 | + */ |
|
| 344 | + public function getDOM($url = '') { |
|
| 345 | + $html = $this->getHTML($url); |
|
| 346 | + if (empty($html)) { |
|
| 347 | + return false; |
|
| 348 | + } |
|
| 349 | + $doc = new DOMDocument(); |
|
| 350 | 350 | |
| 351 | - libxml_use_internal_errors(true); |
|
| 351 | + libxml_use_internal_errors(true); |
|
| 352 | 352 | |
| 353 | - if (is_callable('mb_convert_encoding')) { |
|
| 354 | - $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); |
|
| 355 | - } else { |
|
| 356 | - $doc->loadHTML($html); |
|
| 357 | - } |
|
| 358 | - if (!$doc->documentURI) { |
|
| 359 | - $doc->documentURI = $url; |
|
| 360 | - } |
|
| 353 | + if (is_callable('mb_convert_encoding')) { |
|
| 354 | + $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); |
|
| 355 | + } else { |
|
| 356 | + $doc->loadHTML($html); |
|
| 357 | + } |
|
| 358 | + if (!$doc->documentURI) { |
|
| 359 | + $doc->documentURI = $url; |
|
| 360 | + } |
|
| 361 | 361 | |
| 362 | - libxml_clear_errors(); |
|
| 362 | + libxml_clear_errors(); |
|
| 363 | 363 | |
| 364 | - return $doc; |
|
| 365 | - } |
|
| 366 | - |
|
| 367 | - /** |
|
| 368 | - * Parses document title |
|
| 369 | - * |
|
| 370 | - * @param DOMDocument $doc Document |
|
| 371 | - * @return string |
|
| 372 | - */ |
|
| 373 | - public function parseTitle(DOMDocument $doc) { |
|
| 374 | - $node = $doc->getElementsByTagName('title'); |
|
| 375 | - $title = $node->item(0)->nodeValue; |
|
| 376 | - return ($title) ?: ''; |
|
| 377 | - } |
|
| 378 | - |
|
| 379 | - /** |
|
| 380 | - * Parses <link> tags |
|
| 381 | - * |
|
| 382 | - * @param DOMDocument $doc Document |
|
| 383 | - * @return array |
|
| 384 | - */ |
|
| 385 | - public function parseLinkTags(DOMDocument $doc) { |
|
| 386 | - |
|
| 387 | - $meta = array(); |
|
| 388 | - |
|
| 389 | - $nodes = $doc->getElementsByTagName('link'); |
|
| 390 | - foreach ($nodes as $node) { |
|
| 391 | - $rel = $node->getAttribute('rel'); |
|
| 392 | - $href = $node->getAttribute('href'); |
|
| 393 | - |
|
| 394 | - switch ($rel) { |
|
| 395 | - |
|
| 396 | - case 'icon' : |
|
| 397 | - $image_url = $this->getAbsoluteURL($doc, $href); |
|
| 398 | - if ($this->isImage($image_url)) { |
|
| 399 | - $meta['icons'][] = $image_url; |
|
| 400 | - } |
|
| 401 | - break; |
|
| 402 | - |
|
| 403 | - case 'canonical' : |
|
| 404 | - $meta['canonical'] = $this->getAbsoluteURL($doc, $href); |
|
| 405 | - break; |
|
| 406 | - |
|
| 407 | - case 'alternate' : |
|
| 408 | - $type = $node->getAttribute('type'); |
|
| 409 | - if (in_array($type, array( |
|
| 410 | - 'application/json+oembed', |
|
| 411 | - 'text/json+oembed', |
|
| 412 | - 'application/xml+oembed', |
|
| 413 | - 'text/xml+oembed' |
|
| 414 | - ))) { |
|
| 415 | - $meta['oembed_url'][] = $this->getAbsoluteURL($doc, $href); |
|
| 416 | - } |
|
| 417 | - break; |
|
| 418 | - } |
|
| 419 | - } |
|
| 420 | - |
|
| 421 | - return $meta; |
|
| 422 | - } |
|
| 423 | - |
|
| 424 | - /** |
|
| 425 | - * Parses <meta> tags |
|
| 426 | - * |
|
| 427 | - * @param DOMDocument $doc Document |
|
| 428 | - * @return array |
|
| 429 | - */ |
|
| 430 | - public function parseMetaTags(DOMDocument $doc) { |
|
| 431 | - |
|
| 432 | - $meta = array(); |
|
| 433 | - |
|
| 434 | - $nodes = $doc->getElementsByTagName('meta'); |
|
| 435 | - if (!empty($nodes)) { |
|
| 436 | - foreach ($nodes as $node) { |
|
| 437 | - $name = $node->getAttribute('name'); |
|
| 438 | - if (!$name) { |
|
| 439 | - $name = $node->getAttribute('property'); |
|
| 440 | - } |
|
| 441 | - if (!$name) { |
|
| 442 | - continue; |
|
| 443 | - } |
|
| 444 | - |
|
| 445 | - $name = strtolower($name); |
|
| 446 | - |
|
| 447 | - $content = $node->getAttribute('content'); |
|
| 448 | - if (isset($meta['metatags'][$name])) { |
|
| 449 | - if (!is_array($meta['metatags'][$name])) { |
|
| 450 | - $meta['metatags'][$name] = array($meta['metatags'][$name]); |
|
| 451 | - } |
|
| 452 | - $meta['metatags'][$name][] = $content; |
|
| 453 | - } else { |
|
| 454 | - $meta['metatags'][$name] = $content; |
|
| 455 | - } |
|
| 456 | - |
|
| 457 | - switch ($name) { |
|
| 458 | - |
|
| 459 | - case 'title' : |
|
| 460 | - case 'og:title' : |
|
| 461 | - case 'twitter:title' : |
|
| 462 | - if (empty($meta['title'])) { |
|
| 463 | - $meta['title'] = $content; |
|
| 464 | - } |
|
| 465 | - break; |
|
| 466 | - |
|
| 467 | - case 'og:type' : |
|
| 468 | - if (empty($meta['type'])) { |
|
| 469 | - $meta['type'] = $content; |
|
| 470 | - } |
|
| 471 | - break; |
|
| 472 | - |
|
| 473 | - case 'description' : |
|
| 474 | - case 'og:description' : |
|
| 475 | - case 'twitter:description' : |
|
| 476 | - if (empty($meta['description'])) { |
|
| 477 | - $meta['description'] = $content; |
|
| 478 | - } |
|
| 479 | - break; |
|
| 480 | - |
|
| 481 | - case 'keywords' : |
|
| 482 | - if (is_string($content)) { |
|
| 483 | - $content = explode(',', $content); |
|
| 484 | - $content = array_map('trim', $content); |
|
| 485 | - } |
|
| 486 | - $meta['tags'] = $content; |
|
| 487 | - break; |
|
| 488 | - |
|
| 489 | - case 'og:site_name' : |
|
| 490 | - case 'twitter:site' : |
|
| 491 | - if (empty($meta['provider_name'])) { |
|
| 492 | - $meta['provider_name'] = $content; |
|
| 493 | - } |
|
| 494 | - break; |
|
| 495 | - |
|
| 496 | - case 'og:image' : |
|
| 497 | - case 'twitter:image' : |
|
| 498 | - $image_url = $this->getAbsoluteURL($doc, $content); |
|
| 499 | - if ($this->isImage($image_url)) { |
|
| 500 | - $meta['thumbnails'][] = $image_url; |
|
| 501 | - } |
|
| 502 | - break; |
|
| 503 | - } |
|
| 504 | - } |
|
| 505 | - } |
|
| 506 | - |
|
| 507 | - return $meta; |
|
| 508 | - } |
|
| 509 | - |
|
| 510 | - /** |
|
| 511 | - * Parses <img> tags |
|
| 512 | - * |
|
| 513 | - * @param DOMDocument $doc Document |
|
| 514 | - * @return array |
|
| 515 | - */ |
|
| 516 | - public function parseImgTags(DOMDocument $doc) { |
|
| 517 | - |
|
| 518 | - $meta = array(); |
|
| 519 | - |
|
| 520 | - $nodes = $doc->getElementsByTagName('img'); |
|
| 521 | - foreach ($nodes as $node) { |
|
| 522 | - $src = $node->getAttribute('src'); |
|
| 523 | - $image_url = $this->getAbsoluteURL($doc, $src); |
|
| 524 | - if ($this->isImage($image_url)) { |
|
| 525 | - $meta['thumbnails'][] = $image_url; |
|
| 526 | - } |
|
| 527 | - } |
|
| 528 | - |
|
| 529 | - return $meta; |
|
| 530 | - } |
|
| 531 | - |
|
| 532 | - /** |
|
| 533 | - * Normalizes relative URLs |
|
| 534 | - * |
|
| 535 | - * @param DOMDocument $doc Document |
|
| 536 | - * @param string $href URL to normalize |
|
| 537 | - * @return string|false |
|
| 538 | - */ |
|
| 539 | - public function getAbsoluteURL(DOMDocument $doc, $href = '') { |
|
| 540 | - |
|
| 541 | - if (preg_match("/^data:/i", $href)) { |
|
| 542 | - // data URIs can not be resolved |
|
| 543 | - return false; |
|
| 544 | - } |
|
| 545 | - |
|
| 546 | - // Check if $url is absolute |
|
| 547 | - if (parse_url($href, PHP_URL_HOST)) { |
|
| 548 | - return $href; |
|
| 549 | - } |
|
| 550 | - |
|
| 551 | - $uri = trim($doc->documentURI ?: '', '/'); |
|
| 552 | - |
|
| 553 | - // Check if $url is relative to root |
|
| 554 | - if (substr($href, 0, 1) === "/") { |
|
| 555 | - $scheme = parse_url($uri, PHP_URL_SCHEME); |
|
| 556 | - $host = parse_url($uri, PHP_URL_HOST); |
|
| 557 | - return "$scheme://$host$href"; |
|
| 558 | - } |
|
| 559 | - |
|
| 560 | - // $url is relative to page |
|
| 561 | - $uri = pathinfo($uri, PATHINFO_DIRNAME); |
|
| 562 | - return "$uri/$href"; |
|
| 563 | - } |
|
| 364 | + return $doc; |
|
| 365 | + } |
|
| 366 | + |
|
| 367 | + /** |
|
| 368 | + * Parses document title |
|
| 369 | + * |
|
| 370 | + * @param DOMDocument $doc Document |
|
| 371 | + * @return string |
|
| 372 | + */ |
|
| 373 | + public function parseTitle(DOMDocument $doc) { |
|
| 374 | + $node = $doc->getElementsByTagName('title'); |
|
| 375 | + $title = $node->item(0)->nodeValue; |
|
| 376 | + return ($title) ?: ''; |
|
| 377 | + } |
|
| 378 | + |
|
| 379 | + /** |
|
| 380 | + * Parses <link> tags |
|
| 381 | + * |
|
| 382 | + * @param DOMDocument $doc Document |
|
| 383 | + * @return array |
|
| 384 | + */ |
|
| 385 | + public function parseLinkTags(DOMDocument $doc) { |
|
| 386 | + |
|
| 387 | + $meta = array(); |
|
| 388 | + |
|
| 389 | + $nodes = $doc->getElementsByTagName('link'); |
|
| 390 | + foreach ($nodes as $node) { |
|
| 391 | + $rel = $node->getAttribute('rel'); |
|
| 392 | + $href = $node->getAttribute('href'); |
|
| 393 | + |
|
| 394 | + switch ($rel) { |
|
| 395 | + |
|
| 396 | + case 'icon' : |
|
| 397 | + $image_url = $this->getAbsoluteURL($doc, $href); |
|
| 398 | + if ($this->isImage($image_url)) { |
|
| 399 | + $meta['icons'][] = $image_url; |
|
| 400 | + } |
|
| 401 | + break; |
|
| 402 | + |
|
| 403 | + case 'canonical' : |
|
| 404 | + $meta['canonical'] = $this->getAbsoluteURL($doc, $href); |
|
| 405 | + break; |
|
| 406 | + |
|
| 407 | + case 'alternate' : |
|
| 408 | + $type = $node->getAttribute('type'); |
|
| 409 | + if (in_array($type, array( |
|
| 410 | + 'application/json+oembed', |
|
| 411 | + 'text/json+oembed', |
|
| 412 | + 'application/xml+oembed', |
|
| 413 | + 'text/xml+oembed' |
|
| 414 | + ))) { |
|
| 415 | + $meta['oembed_url'][] = $this->getAbsoluteURL($doc, $href); |
|
| 416 | + } |
|
| 417 | + break; |
|
| 418 | + } |
|
| 419 | + } |
|
| 420 | + |
|
| 421 | + return $meta; |
|
| 422 | + } |
|
| 423 | + |
|
| 424 | + /** |
|
| 425 | + * Parses <meta> tags |
|
| 426 | + * |
|
| 427 | + * @param DOMDocument $doc Document |
|
| 428 | + * @return array |
|
| 429 | + */ |
|
| 430 | + public function parseMetaTags(DOMDocument $doc) { |
|
| 431 | + |
|
| 432 | + $meta = array(); |
|
| 433 | + |
|
| 434 | + $nodes = $doc->getElementsByTagName('meta'); |
|
| 435 | + if (!empty($nodes)) { |
|
| 436 | + foreach ($nodes as $node) { |
|
| 437 | + $name = $node->getAttribute('name'); |
|
| 438 | + if (!$name) { |
|
| 439 | + $name = $node->getAttribute('property'); |
|
| 440 | + } |
|
| 441 | + if (!$name) { |
|
| 442 | + continue; |
|
| 443 | + } |
|
| 444 | + |
|
| 445 | + $name = strtolower($name); |
|
| 446 | + |
|
| 447 | + $content = $node->getAttribute('content'); |
|
| 448 | + if (isset($meta['metatags'][$name])) { |
|
| 449 | + if (!is_array($meta['metatags'][$name])) { |
|
| 450 | + $meta['metatags'][$name] = array($meta['metatags'][$name]); |
|
| 451 | + } |
|
| 452 | + $meta['metatags'][$name][] = $content; |
|
| 453 | + } else { |
|
| 454 | + $meta['metatags'][$name] = $content; |
|
| 455 | + } |
|
| 456 | + |
|
| 457 | + switch ($name) { |
|
| 458 | + |
|
| 459 | + case 'title' : |
|
| 460 | + case 'og:title' : |
|
| 461 | + case 'twitter:title' : |
|
| 462 | + if (empty($meta['title'])) { |
|
| 463 | + $meta['title'] = $content; |
|
| 464 | + } |
|
| 465 | + break; |
|
| 466 | + |
|
| 467 | + case 'og:type' : |
|
| 468 | + if (empty($meta['type'])) { |
|
| 469 | + $meta['type'] = $content; |
|
| 470 | + } |
|
| 471 | + break; |
|
| 472 | + |
|
| 473 | + case 'description' : |
|
| 474 | + case 'og:description' : |
|
| 475 | + case 'twitter:description' : |
|
| 476 | + if (empty($meta['description'])) { |
|
| 477 | + $meta['description'] = $content; |
|
| 478 | + } |
|
| 479 | + break; |
|
| 480 | + |
|
| 481 | + case 'keywords' : |
|
| 482 | + if (is_string($content)) { |
|
| 483 | + $content = explode(',', $content); |
|
| 484 | + $content = array_map('trim', $content); |
|
| 485 | + } |
|
| 486 | + $meta['tags'] = $content; |
|
| 487 | + break; |
|
| 488 | + |
|
| 489 | + case 'og:site_name' : |
|
| 490 | + case 'twitter:site' : |
|
| 491 | + if (empty($meta['provider_name'])) { |
|
| 492 | + $meta['provider_name'] = $content; |
|
| 493 | + } |
|
| 494 | + break; |
|
| 495 | + |
|
| 496 | + case 'og:image' : |
|
| 497 | + case 'twitter:image' : |
|
| 498 | + $image_url = $this->getAbsoluteURL($doc, $content); |
|
| 499 | + if ($this->isImage($image_url)) { |
|
| 500 | + $meta['thumbnails'][] = $image_url; |
|
| 501 | + } |
|
| 502 | + break; |
|
| 503 | + } |
|
| 504 | + } |
|
| 505 | + } |
|
| 506 | + |
|
| 507 | + return $meta; |
|
| 508 | + } |
|
| 509 | + |
|
| 510 | + /** |
|
| 511 | + * Parses <img> tags |
|
| 512 | + * |
|
| 513 | + * @param DOMDocument $doc Document |
|
| 514 | + * @return array |
|
| 515 | + */ |
|
| 516 | + public function parseImgTags(DOMDocument $doc) { |
|
| 517 | + |
|
| 518 | + $meta = array(); |
|
| 519 | + |
|
| 520 | + $nodes = $doc->getElementsByTagName('img'); |
|
| 521 | + foreach ($nodes as $node) { |
|
| 522 | + $src = $node->getAttribute('src'); |
|
| 523 | + $image_url = $this->getAbsoluteURL($doc, $src); |
|
| 524 | + if ($this->isImage($image_url)) { |
|
| 525 | + $meta['thumbnails'][] = $image_url; |
|
| 526 | + } |
|
| 527 | + } |
|
| 528 | + |
|
| 529 | + return $meta; |
|
| 530 | + } |
|
| 531 | + |
|
| 532 | + /** |
|
| 533 | + * Normalizes relative URLs |
|
| 534 | + * |
|
| 535 | + * @param DOMDocument $doc Document |
|
| 536 | + * @param string $href URL to normalize |
|
| 537 | + * @return string|false |
|
| 538 | + */ |
|
| 539 | + public function getAbsoluteURL(DOMDocument $doc, $href = '') { |
|
| 540 | + |
|
| 541 | + if (preg_match("/^data:/i", $href)) { |
|
| 542 | + // data URIs can not be resolved |
|
| 543 | + return false; |
|
| 544 | + } |
|
| 545 | + |
|
| 546 | + // Check if $href is absolute |
|
| 547 | + if (parse_url($href, PHP_URL_HOST)) { |
|
| 548 | + return $href; |
|
| 549 | + } |
|
| 550 | + |
|
| 551 | + $uri = trim($doc->documentURI ?: '', '/'); |
|
| 552 | + |
|
| 553 | + // Check if $href is relative to root |
|
| 554 | + if (substr($href, 0, 1) === "/") { |
|
| 555 | + $scheme = parse_url($uri, PHP_URL_SCHEME); |
|
| 556 | + $host = parse_url($uri, PHP_URL_HOST); |
|
| 557 | + return "$scheme://$host$href"; |
|
| 558 | + } |
|
| 559 | + |
|
| 560 | + // $href is relative to page |
|
| 561 | + $uri = pathinfo($uri, PATHINFO_DIRNAME); |
|
| 562 | + return "$uri/$href"; |
|
| 563 | + } |
|
| 564 | 564 | |
| 565 | 565 | } |