Complex classes like Parser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Parser, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
13 | class Parser { |
||
14 | |||
15 | /** |
||
16 | * @var ClientInterface |
||
17 | */ |
||
18 | private $client; |
||
19 | |||
20 | /** |
||
21 | * @var array |
||
22 | */ |
||
23 | private static $cache; |
||
24 | |||
25 | /** |
||
26 | * Constructor |
||
27 | * @param ClientInterface $client HTTP Client |
||
28 | */ |
||
29 | public function __construct(ClientInterface $client) { |
||
32 | |||
33 | /** |
||
34 | * Parses a URL into an array of metatags |
||
35 | * |
||
36 | * @param string $url URL to parse |
||
37 | * @return array |
||
38 | */ |
||
39 | 1 | public function parse($url = '') { |
|
69 | |||
70 | /** |
||
71 | * Parses image metatags |
||
72 | * |
||
73 | * @param string $url URL of the image |
||
74 | * @return array|false |
||
75 | */ |
||
76 | 1 | public function getImageData($url = '') { |
|
87 | |||
88 | /** |
||
89 | * Parses OEmbed data |
||
90 | * |
||
91 | * @param string $url URL of the image |
||
92 | * @return array|false |
||
93 | */ |
||
94 | 2 | public function getOEmbedData($url = '') { |
|
139 | |||
140 | /** |
||
141 | * Parses metatags from DOM |
||
142 | * |
||
143 | * @param string $url URL |
||
144 | * @return array|false |
||
145 | */ |
||
146 | 1 | public function getDOMData($url = '') { |
|
174 | |||
175 | /** |
||
176 | * Check if URL exists and is reachable by making an HTTP request to retrieve header information |
||
177 | * |
||
178 | * @param string $url URL of the resource |
||
179 | * @return boolean |
||
180 | */ |
||
181 | 1 | public function exists($url = '') { |
|
188 | |||
/**
 * Validates a URL, tolerating multibyte characters that
 * FILTER_VALIDATE_URL would otherwise reject
 *
 * @param string $url URL to validate
 * @return string|false The URL when it is considered valid, false otherwise
 */
public function isValidUrl($url = '') {
    // based on http://php.net/manual/en/function.filter-var.php#104160
    // adapted by @mrclay in https://github.com/mrclay/Elgg-leaf/blob/62bf31c0ccdaab549a7e585a4412443e09821db3/engine/lib/output.php
    $validated = filter_var($url, FILTER_VALIDATE_URL);
    if ($validated) {
        return $validated;
    }

    // Equal byte and character counts mean there are no multibyte
    // characters, so the first verdict stands.
    $char_count = mb_strlen($url);
    if (strlen($url) == $char_count) {
        return $validated;
    }

    // Build a copy in which every multibyte character is replaced by "X".
    $ascii_copy = '';
    $index = 0;
    while ($index < $char_count) {
        $char = elgg_substr($url, $index, 1);
        if (strlen($char) > 1) {
            $ascii_copy .= 'X';
        } else {
            $ascii_copy .= $char;
        }
        $index++;
    }

    // Validate the substituted copy, but hand back the original URL.
    if (filter_var($ascii_copy, FILTER_VALIDATE_URL)) {
        return $url;
    }

    return false;
}
||
216 | |||
/**
 * Performs a GET request for the resource and returns the response
 *
 * Spaces in the URL are percent-encoded before validation. The outcome of
 * each request (including a failed one, stored as false) is kept in a
 * static per-URL cache, so a URL is only requested once.
 *
 * @param string $url URL of the resource
 * @return Response|false False if the URL is invalid or the request threw
 */
public function request($url = '') {
    $url = str_replace(' ', '%20', $url);
    if (!$this->isValidUrl($url)) {
        return false;
    }

    if (!isset(self::$cache[$url])) {
        try {
            $response = $this->client->request('GET', $url);
        } catch (\Exception $e) {
            // Fully qualified so the catch still matches if this file
            // declares a namespace without importing Exception.
            $response = false;
            error_log("Parser Error for GET request ($url): {$e->getMessage()}");
        }
        self::$cache[$url] = $response;
    }

    return self::$cache[$url];
}
||
240 | |||
/**
 * Returns the response body of the resource as a string
 *
 * @param string $url URL of the resource
 * @return string Body contents, or an empty string when exists() reports
 *                that the URL is not available
 */
public function read($url = '') {
    if ($this->exists($url)) {
        return (string) $this->request($url)->getBody();
    }

    return '';
}
||
257 | |||
258 | /** |
||
259 | * Checks if resource is an html page |
||
260 | * |
||
261 | * @param string $url URL of the resource |
||
262 | * @return boolean |
||
263 | */ |
||
264 | 1 | public function isHTML($url = '') { |
|
265 | 1 | $mime = $this->getContentType($url); |
|
266 | 1 | return strpos($mime, 'text/html') !== false; |
|
267 | } |
||
268 | |||
269 | /** |
||
270 | * Checks if resource is JSON |
||
271 | * |
||
272 | * @param string $url URL of the resource |
||
273 | * @return boolean |
||
274 | */ |
||
275 | 1 | public function isJSON($url = '') { |
|
276 | 1 | $mime = $this->getContentType($url); |
|
277 | 1 | return strpos($mime, 'json') !== false; |
|
278 | } |
||
279 | |||
280 | /** |
||
281 | * Checks if resource is XML |
||
282 | * |
||
283 | * @param string $url URL of the resource |
||
284 | * @return boolean |
||
285 | */ |
||
286 | 1 | public function isXML($url = '') { |
|
287 | 1 | $mime = $this->getContentType($url); |
|
288 | 1 | return strpos($mime, 'xml') !== false; |
|
289 | } |
||
290 | |||
291 | /** |
||
292 | * Checks if resource is an image |
||
293 | * |
||
294 | * @param string $url URL of the resource |
||
295 | * @return boolean |
||
296 | */ |
||
297 | 1 | public function isImage($url = '') { |
|
298 | 1 | $mime = $this->getContentType($url); |
|
299 | 1 | if ($mime) { |
|
300 | 1 | list($simple, ) = explode('/', $mime); |
|
301 | 1 | return ($simple == 'image'); |
|
302 | } |
||
303 | |||
304 | 1 | return false; |
|
305 | } |
||
306 | |||
307 | /** |
||
308 | * Get mime type of the URL content |
||
309 | * |
||
310 | * @param string $url URL of the resource |
||
311 | * @return string |
||
312 | */ |
||
313 | 1 | public function getContentType($url = '') { |
|
314 | 1 | $response = $this->request($url); |
|
315 | 1 | if ($response instanceof Response) { |
|
316 | 1 | $header = $response->getHeader('Content-Type'); |
|
317 | 1 | if (is_array($header) && !empty($header)) { |
|
318 | 1 | $parts = explode(';', $header[0]); |
|
319 | 1 | return trim($parts[0]); |
|
320 | } |
||
321 | 1 | } |
|
322 | 1 | return ''; |
|
323 | } |
||
324 | |||
325 | /** |
||
326 | * Returns HTML contents of the page |
||
327 | * |
||
328 | * @param string $url URL of the resource |
||
329 | * @return string |
||
330 | */ |
||
331 | 1 | public function getHTML($url = '') { |
|
332 | 1 | if (!$this->isHTML($url)) { |
|
333 | 1 | return ''; |
|
334 | } |
||
335 | 1 | return $this->read($url); |
|
336 | } |
||
337 | |||
338 | /** |
||
339 | * Returns HTML contents of the page as a DOMDocument |
||
340 | * |
||
341 | * @param string $url URL of the resource |
||
342 | * @return DOMDocument|false |
||
343 | */ |
||
344 | 1 | public function getDOM($url = '') { |
|
345 | 1 | $html = $this->getHTML($url); |
|
346 | 1 | if (empty($html)) { |
|
347 | 1 | return false; |
|
348 | } |
||
349 | 1 | $doc = new DOMDocument(); |
|
350 | |||
351 | 1 | libxml_use_internal_errors(true); |
|
352 | |||
353 | 1 | if (is_callable('mb_convert_encoding')) { |
|
354 | 1 | $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); |
|
355 | 1 | } else { |
|
356 | $doc->loadHTML($html); |
||
357 | } |
||
358 | 1 | if (!$doc->documentURI) { |
|
359 | 1 | $doc->documentURI = $url; |
|
360 | 1 | } |
|
361 | |||
362 | 1 | libxml_clear_errors(); |
|
363 | |||
364 | 1 | return $doc; |
|
365 | } |
||
366 | |||
367 | /** |
||
368 | * Parses document title |
||
369 | * |
||
370 | * @param DOMDocument $doc Document |
||
371 | * @return string |
||
372 | */ |
||
373 | 1 | public function parseTitle(DOMDocument $doc) { |
|
378 | |||
379 | /** |
||
380 | * Parses <link> tags |
||
381 | * |
||
382 | * @param DOMDocument $doc Document |
||
383 | * @return array |
||
384 | */ |
||
385 | 1 | public function parseLinkTags(DOMDocument $doc) { |
|
386 | |||
387 | $meta = array( |
||
388 | 1 | 'icons' => [], |
|
426 | |||
/**
 * Collects data from the <meta> tags of a document
 *
 * Every named tag is recorded under 'metatags' (repeated names become a
 * list of values). Selected tags additionally populate the 'title', 'type',
 * 'description', 'tags', 'provider_name' and 'thumbnails' keys.
 *
 * @param DOMDocument $doc Document
 * @return array
 */
public function parseMetaTags(DOMDocument $doc) {

    // Tag name => summary field; the first non-empty value wins.
    $first_wins = array(
        'title' => 'title',
        'og:title' => 'title',
        'twitter:title' => 'title',
        'og:type' => 'type',
        'description' => 'description',
        'og:description' => 'description',
        'twitter:description' => 'description',
        'og:site_name' => 'provider_name',
        'twitter:site' => 'provider_name',
    );

    // Alternative spellings that are folded into og:image.
    $image_aliases = array('og:image:url', 'og:image:secure_url');

    $meta = array();

    foreach ($doc->getElementsByTagName('meta') as $node) {
        // The tag is identified by "name", falling back to "property".
        $name = $node->getAttribute('name');
        if (!$name) {
            $name = $node->getAttribute('property');
        }
        if (!$name) {
            continue;
        }

        $name = strtolower($name);
        if (in_array($name, $image_aliases, true)) {
            $name = 'og:image';
        }

        $content = $node->getAttribute('content');

        // Raw record: a single value stays scalar, repeats become a list.
        if (!isset($meta['metatags'][$name])) {
            $meta['metatags'][$name] = $content;
        } else {
            $meta['metatags'][$name] = (array) $meta['metatags'][$name];
            $meta['metatags'][$name][] = $content;
        }

        if (isset($first_wins[$name])) {
            $field = $first_wins[$name];
            if (empty($meta[$field])) {
                $meta[$field] = $content;
            }
        } elseif ($name === 'keywords') {
            // Comma-separated keywords become a list of trimmed tags.
            if (is_string($content)) {
                $content = array_map('trim', explode(',', $content));
            }
            $meta['tags'] = $content;
        } elseif ($name === 'og:image' || $name === 'twitter:image') {
            // Only URLs that resolve to an image are kept as thumbnails.
            $image_url = $this->getAbsoluteURL($doc, $content);
            if ($this->isImage($image_url)) {
                $meta['thumbnails'][] = $image_url;
            }
        }
    }

    return $meta;
}
||
516 | |||
517 | /** |
||
518 | * Parses <img> tags |
||
519 | * |
||
520 | * @param DOMDocument $doc Document |
||
521 | * @return array |
||
522 | */ |
||
523 | 1 | public function parseImgTags(DOMDocument $doc) { |
|
540 | |||
541 | /** |
||
542 | * Normalizes relative URLs |
||
543 | * |
||
544 | * @param DOMDocument $doc Document |
||
545 | * @param string $href URL to normalize |
||
546 | * @return string|false |
||
547 | */ |
||
548 | 1 | public function getAbsoluteURL(DOMDocument $doc, $href = '') { |
|
575 | |||
576 | } |
||
577 |