Complex classes like Parser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
While breaking up the class, it is a good idea to analyze how other classes use Parser, and based on these observations, apply Extract Interface, too.
1 | <?php |
||
13 | class Parser { |
||
14 | |||
15 | /** |
||
16 | * @var ClientInterface |
||
17 | */ |
||
18 | private $client; |
||
19 | |||
20 | /** |
||
21 | * @var array |
||
22 | */ |
||
23 | private static $cache; |
||
24 | |||
25 | /** |
||
26 | * Constructor |
||
27 | * @param ClientInterface $client HTTP Client |
||
28 | */ |
||
29 | public function __construct(ClientInterface $client) { |
||
32 | |||
33 | /** |
||
34 | * Parses a URL into an array of metatags |
||
35 | * |
||
36 | * @param string $url URL to parse |
||
37 | * @return array |
||
38 | */ |
||
public function parse($url = '') {

    // Sources are tried in a fixed order: image data, then oEmbed data, then
    // DOM data. A later source is consulted only when the earlier one
    // returned something falsy.
    $data = $this->getImageData($url);
    if (!$data) {
        $data = $this->getOEmbedData($url);
    }
    if (!$data) {
        $data = $this->getDOMData($url);
        if (is_array($data) && !empty($data['oembed_url'])) {
            // NOTE(review): each discovered oEmbed URL is parsed recursively
            // with no depth limit or visited-set; a document that advertises
            // itself (or a cycle) as its oEmbed endpoint could presumably
            // recurse without bound - confirm against getDOMData()/getOEmbedData().
            foreach ($data['oembed_url'] as $oembed_url) {
                $oembed_data = $this->parse($oembed_url);
                if (empty($oembed_data) || !is_array($oembed_data)) {
                    continue;
                }
                // Re-key the nested result's own 'url' as 'oembed_url' so the
                // merge below does not overwrite the 'url' of the page being
                // parsed. Only do so when the key is actually present:
                // reading a missing key raised an undefined-index notice and
                // replaced 'oembed_url' with null.
                if (array_key_exists('url', $oembed_data)) {
                    $oembed_data['oembed_url'] = $oembed_data['url'];
                    unset($oembed_data['url']);
                }
                $data = array_merge($data, $oembed_data);
            }
        }
    }

    // Callers always receive an array, even when every source failed.
    if (!is_array($data)) {
        $data = array();
    }

    // Fall back to the first collected thumbnail when no explicit
    // thumbnail URL was provided.
    if (empty($data['thumbnail_url']) && !empty($data['thumbnails'])) {
        $data['thumbnail_url'] = $data['thumbnails'][0];
    }

    return $data;
}
||
69 | |||
70 | /** |
||
71 | * Parses image metatags |
||
72 | * |
||
73 | * @param string $url URL of the image |
||
74 | * @return array|false |
||
75 | */ |
||
76 | 1 | public function getImageData($url = '') { |
|
87 | |||
88 | /** |
||
89 | * Parses OEmbed data |
||
90 | * |
||
91 | * @param string $url URL of the resource |
||
92 | * @return array|false |
||
93 | */ |
||
94 | 2 | public function getOEmbedData($url = '') { |
|
139 | |||
140 | /** |
||
141 | * Parses metatags from DOM |
||
142 | * |
||
143 | * @param string $url URL |
||
144 | * @return array|false |
||
145 | */ |
||
146 | 1 | public function getDOMData($url = '') { |
|
174 | |||
175 | /** |
||
176 | * Check if URL exists and is reachable by making an HTTP request to retrieve header information |
||
177 | * |
||
178 | * @param string $url URL of the resource |
||
179 | * @return boolean |
||
180 | */ |
||
181 | 1 | public function exists($url = '') { |
|
188 | |||
189 | /** |
||
190 | * Validate URL |
||
191 | * |
||
192 | * @param string $url URL to validate |
||
193 | * @return bool |
||
194 | */ |
||
public function isValidUrl($url = '') {
    // based on http://php.net/manual/en/function.filter-var.php#104160
    // adapted by @mrclay in https://github.com/mrclay/Elgg-leaf/blob/62bf31c0ccdaab549a7e585a4412443e09821db3/engine/lib/output.php
    $validated = filter_var($url, FILTER_VALIDATE_URL);

    if (!$validated) {
        // A character count that differs from the byte count means the URL
        // contains multibyte characters; only then is a second attempt made.
        $char_count = mb_strlen($url);
        if (strlen($url) != $char_count) {
            // Build a copy in which every multibyte character is replaced
            // by a single "X", leaving single-byte characters untouched.
            $substituted = '';
            $index = 0;
            while ($index < $char_count) {
                $char = elgg_substr($url, $index, 1);
                if (strlen($char) > 1) {
                    $substituted .= 'X';
                } else {
                    $substituted .= $char;
                }
                $index++;
            }

            // If the substituted copy validates, accept the original URL.
            if (filter_var($substituted, FILTER_VALIDATE_URL)) {
                return $url;
            }
            return false;
        }
    }

    // Either the URL validated as-is, or it failed and has no multibyte
    // characters to substitute; in both cases return the first result.
    return $validated;
}
||
216 | |||
217 | /** |
||
218 | * Sends a GET request for the resource and returns the response, reusing a cached result for repeated URLs |
||
219 | * |
||
220 | * @param string $url URL of the resource |
||
221 | * @return Response|false |
||
222 | */ |
||
public function request($url = '') {
    // Encode literal spaces before the URL is validated and requested.
    $url = str_replace(' ', '%20', $url);
    if (!$this->isValidUrl($url)) {
        return false;
    }

    // Responses are memoized per URL in a static (class-wide) cache.
    // A failed request is stored as false, and since isset() is true for
    // false, a failing URL is not requested again.
    if (!isset(self::$cache[$url])) {
        try {
            $response = $this->client->request('GET', $url);
        } catch (Exception $e) {
            $response = false;
            // The request above is a GET; the message previously said HEAD.
            error_log("Parser Error for GET request ($url): {$e->getMessage()}");
        }
        self::$cache[$url] = $response;
    }

    return self::$cache[$url];
}
||
240 | |||
241 | /** |
||
242 | * Get contents of the page |
||
243 | * |
||
244 | * @param string $url URL of the resource |
||
245 | * @return string |
||
246 | */ |
||
public function read($url = '') {
    // Unreachable resources yield an empty body.
    if (!$this->exists($url)) {
        return '';
    }

    $response = $this->request($url);
    // request() returns false for invalid URLs and failed requests; guard
    // before dereferencing so a failure cannot become a fatal
    // "call to a member function on a non-object" error.
    if (!is_object($response)) {
        return '';
    }

    return (string) $response->getBody();
}
||
257 | |||
258 | /** |
||
259 | * Checks if resource is an html page |
||
260 | * |
||
261 | * @param string $url URL of the resource |
||
262 | * @return boolean |
||
263 | */ |
||
public function isHTML($url = '') {
    // True when the resource's content type contains "text/html".
    $content_type = $this->getContentType($url);
    $position = strpos($content_type, 'text/html');
    return false !== $position;
}
||
268 | |||
269 | /** |
||
270 | * Checks if resource is JSON |
||
271 | * |
||
272 | * @param string $url URL of the resource |
||
273 | * @return boolean |
||
274 | */ |
||
public function isJSON($url = '') {
    // True when the resource's content type contains "json" anywhere.
    $content_type = $this->getContentType($url);
    $position = strpos($content_type, 'json');
    return false !== $position;
}
||
279 | |||
280 | /** |
||
281 | * Checks if resource is XML |
||
282 | * |
||
283 | * @param string $url URL of the resource |
||
284 | * @return boolean |
||
285 | */ |
||
public function isXML($url = '') {
    // True when the resource's content type contains "xml" anywhere.
    $content_type = $this->getContentType($url);
    $position = strpos($content_type, 'xml');
    return false !== $position;
}
||
290 | |||
291 | /** |
||
292 | * Checks if resource is an image |
||
293 | * |
||
294 | * @param string $url URL of the resource |
||
295 | * @return boolean |
||
296 | */ |
||
public function isImage($url = '') {
    $content_type = $this->getContentType($url);
    if (!$content_type) {
        // No usable content type means it cannot be classified as an image.
        return false;
    }

    // Compare the top-level media type (the part before the first "/").
    $segments = explode('/', $content_type);
    return $segments[0] == 'image';
}
||
306 | |||
307 | /** |
||
308 | * Get mime type of the URL content |
||
309 | * |
||
310 | * @param string $url URL of the resource |
||
311 | * @return string |
||
312 | */ |
||
public function getContentType($url = '') {
    $response = $this->request($url);
    if (!($response instanceof Response)) {
        // Invalid URL or failed request: no content type available.
        return '';
    }

    $values = $response->getHeader('Content-Type');
    if (!is_array($values) || empty($values)) {
        return '';
    }

    // Keep only the media type: drop everything after the first ";" and
    // trim surrounding whitespace.
    $segments = explode(';', $values[0]);
    return trim($segments[0]);
}
||
324 | |||
325 | /** |
||
326 | * Returns HTML contents of the page |
||
327 | * |
||
328 | * @param string $url URL of the resource |
||
329 | * @return string |
||
330 | */ |
||
public function getHTML($url = '') {
    // Only resources reported as HTML are read; anything else yields an
    // empty string.
    return $this->isHTML($url) ? $this->read($url) : '';
}
||
337 | |||
338 | /** |
||
339 | * Returns HTML contents of the page as a DOMDocument |
||
340 | * |
||
341 | * @param string $url URL of the resource |
||
342 | * @return DOMDocument|false |
||
343 | */ |
||
public function getDOM($url = '') {
    $html = $this->getHTML($url);
    if (empty($html)) {
        return false;
    }
    $doc = new DOMDocument();

    // Collect libxml errors internally while loading the markup. The
    // previous setting is remembered and restored below, because this
    // switch is process-wide and used to stay enabled after the call.
    $previous_setting = libxml_use_internal_errors(true);

    if (is_callable('mb_convert_encoding')) {
        // Converts the markup (treated as UTF-8) to HTML entities before
        // loading - presumably so non-ASCII characters survive loadHTML().
        // NOTE(review): the 'HTML-ENTITIES' target is deprecated as of
        // PHP 8.2; confirm the supported PHP versions.
        $doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
    } else {
        $doc->loadHTML($html);
    }

    // Record the source URL on the document when loading did not set one.
    if (!$doc->documentURI) {
        $doc->documentURI = $url;
    }

    // Discard the errors collected during loading, then restore the
    // caller's libxml error-handling mode.
    libxml_clear_errors();
    libxml_use_internal_errors($previous_setting);

    return $doc;
}
||
366 | |||
367 | /** |
||
368 | * Parses document title |
||
369 | * |
||
370 | * @param DOMDocument $doc Document |
||
371 | * @return string |
||
372 | */ |
||
373 | 1 | public function parseTitle(DOMDocument $doc) { |
|
378 | |||
379 | /** |
||
380 | * Parses <link> tags |
||
381 | * |
||
382 | * @param DOMDocument $doc Document |
||
383 | * @return array |
||
384 | */ |
||
385 | 1 | public function parseLinkTags(DOMDocument $doc) { |
|
386 | |||
387 | $meta = array( |
||
388 | 1 | 'icons' => [], |
|
389 | 1 | 'thumbnails' => [], |
|
426 | |||
427 | /** |
||
428 | * Parses <meta> tags |
||
429 | * |
||
430 | * @param DOMDocument $doc Document |
||
431 | * @return array |
||
432 | */ |
||
433 | 1 | public function parseMetaTags(DOMDocument $doc) { |
|
516 | |||
517 | /** |
||
518 | * Parses <img> tags |
||
519 | * |
||
520 | * @param DOMDocument $doc Document |
||
521 | * @return array |
||
522 | */ |
||
523 | 1 | public function parseImgTags(DOMDocument $doc) { |
|
540 | |||
541 | /** |
||
542 | * Normalizes relative URLs |
||
543 | * |
||
544 | * @param DOMDocument $doc Document |
||
545 | * @param string $href URL to normalize |
||
546 | * @return string|false |
||
547 | */ |
||
548 | 1 | public function getAbsoluteURL(DOMDocument $doc, $href = '') { |
|
575 | |||
576 | } |
||
577 |