Scraper   B
last analyzed

Complexity

Total Complexity 47

Size/Duplication

Total Lines 235
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 2

Importance

Changes 0
Metric Value
wmc 47
lcom 1
cbo 2
dl 0
loc 235
rs 8.439
c 0
b 0
f 0

10 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 8 2
C baseUrl() 0 44 11
B info() 0 27 5
A endRedirect() 0 4 2
A get() 0 12 2
C getIcons() 0 69 17
B getFavicon() 0 21 5
A getUrl() 0 4 1
A setUrl() 0 4 1
A setDataAccess() 0 4 1

How to fix   Complexity   

Complex Class

Complex classes like Scraper often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use Scraper, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
namespace Mpclarkson\IconScraper;
4
5
/**
 * Finds the icons a web site advertises in the <link> tags of its <head>,
 * falling back to "/favicon.ico" when none are declared.
 *
 * All network access goes through the DataAccess collaborator, which can be
 * replaced with setDataAccess() (e.g. for tests).
 */
class Scraper
{
    /**
     * @var string URL used by get() when no URL is passed to it.
     */
    protected $url = '';

    /**
     * @var DataAccess Object used to fetch headers and page bodies.
     */
    protected $dataAccess;

    /**
     * @param array $args Optional settings; only the 'url' key is read.
     */
    public function __construct($args = array())
    {
        if (isset($args['url'])) {
            $this->url = $args['url'];
        }

        $this->dataAccess = new DataAccess();
    }

    /**
     * Reduces a URL to "scheme://[user[:pass]@]host[:port]/".
     *
     * Query string and fragment are always dropped. The path is kept only
     * when $path is truthy; a trailing "/" is appended in every case.
     *
     * @param string $url  URL to normalise.
     * @param bool   $path Whether to keep the path component.
     *
     * @return string|false The base URL, or false when the URL cannot be
     *                      parsed, is not http(s), or has no host.
     */
    public static function baseUrl($url, $path = false)
    {
        $parts = parse_url($url);
        if (!$parts) {
            return false;
        }

        // Only http and https are supported.
        $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : null;
        if ($scheme !== 'http' && $scheme !== 'https') {
            return false;
        }

        if (!isset($parts['host'])) {
            return false;
        }

        $base = "{$scheme}://";

        // Optional credentials.
        if (isset($parts['user'])) {
            $base .= $parts['user'];
            if (isset($parts['pass'])) {
                $base .= ":{$parts['pass']}";
            }
            $base .= '@';
        }

        $base .= $parts['host'];

        if (isset($parts['port'])) {
            $base .= ":{$parts['port']}";
        }

        if ($path && isset($parts['path'])) {
            $base .= $parts['path'];
        }

        return $base . '/';
    }

    /**
     * Requests the headers of a URL and reports the final status code and
     * the redirect target, if any.
     *
     * @param string|false $url URL to inspect.
     *
     * @return array|false ['status' => string, 'url' => string], or false
     *                     when the URL is empty, no headers were returned,
     *                     or no status line could be parsed.
     */
    public function info($url)
    {
        if (empty($url)) {
            return false;
        }

        $headers = $this->dataAccess->retrieveHeader($url);

        // A failed request may not yield an array at all.
        if (!is_array($headers)) {
            return false;
        }

        // Status lines are the entries stored under numeric keys.
        $statusLines = array_filter($headers, function ($key) {
            return is_int($key);
        }, ARRAY_FILTER_USE_KEY);

        if (empty($statusLines)) {
            return false;
        }

        // The last status line belongs to the final response.
        $pieces = explode(' ', end($statusLines));

        if (!array_key_exists(1, $pieces)) {
            return false;
        }

        $status = $pieces[1];

        if (isset($headers['location'])) {
            $location = $headers['location'];

            // A header sent more than once (e.g. along a redirect chain) may
            // be delivered as a list; the last entry is the final target.
            if (is_array($location)) {
                $location = end($location);
            }

            $url = $location;
        }

        return ['status' => $status, 'url' => $url];
    }

    /**
     * Follows redirects for a URL and returns where it ends up.
     *
     * @param false|string $url
     *
     * @return string|false Final URL, or false when it cannot be determined.
     */
    public function endRedirect($url)
    {
        $out = $this->info($url);

        return !empty($out['url']) ? $out['url'] : false;
    }

    /**
     * Returns the icons found for a URL.
     *
     * @param string $url URL to scrape; when non-empty it replaces the URL
     *                    stored on the instance.
     *
     * @return array of icons or empty array
     **/
    public function get($url = '')
    {
        // URLs passed to this method take precedence.
        if (!empty($url)) {
            $this->url = $url;
        }

        $finalUrl = $this->endRedirect(static::baseUrl($this->url, false));

        // Invalid URL or unreachable host: nothing to scrape.
        if ($finalUrl === false) {
            return [];
        }

        return $this->getIcons(rtrim($finalUrl, '/'));
    }

    /**
     * Collects the icons declared in the <head> of the page at $url, sorted
     * by ascending width, falling back to /favicon.ico when none are found.
     *
     * @param string $url Base URL without trailing slash.
     *
     * @return array List of Icon objects, possibly empty.
     */
    private function getIcons($url)
    {
        if (empty($url)) {
            return [];
        }

        $head = $this->extractHead($this->dataAccess->retrieveUrl("{$url}/"));

        if ($head === false) {
            return [];
        }

        $icons = $this->parseIconLinks($head, $url);

        // Sort the icons by width. Widths are cast so that a non-numeric
        // value cannot make the subtraction fail.
        usort($icons, function ($a, $b) {
            return (int) $a->getWidth() - (int) $b->getWidth();
        });

        // If it is empty, try and get one from the root.
        if (empty($icons)) {
            $icons = $this->getFavicon($url);
        }

        return $icons;
    }

    /**
     * Cuts the <head>...</head> section out of an HTML document.
     *
     * @param mixed $html Page body; anything but a string yields false.
     *
     * @return string|false The head markup, or false when there is none.
     */
    private function extractHead($html)
    {
        if (!is_string($html)) {
            return false;
        }

        if (!preg_match('!<head.*?>.*</head>!ims', $html, $match)) {
            return false;
        }

        return $match[0];
    }

    /**
     * Builds Icon objects from the <link> elements of a head fragment.
     *
     * @param string $head Markup of the <head> section.
     * @param string $url  Base URL used to resolve relative hrefs.
     *
     * @return array List of Icon objects in document order.
     */
    private function parseIconLinks($head, $url)
    {
        $icons = [];
        $dom = new \DOMDocument();

        // Use error suppression, because the HTML might be too malformed.
        if (!@$dom->loadHTML($head)) {
            return $icons;
        }

        foreach ($dom->getElementsByTagName('link') as $link) {
            if (!$link->hasAttribute('rel')) {
                continue;
            }

            $href = $link->getAttribute('href');
            if (!$href) {
                continue;
            }

            $rel = strtolower($link->getAttribute('rel'));
            $type = false;
            $size = [];

            // Loose comparison on purpose: same semantics as a switch/case.
            if ($rel == Icon::APPLE_TOUCH) {
                $type = Icon::APPLE_TOUCH;
                // Only touch icons carry their "sizes" value, split on "x".
                if ($link->hasAttribute('sizes')) {
                    $size = explode('x', $link->getAttribute('sizes'));
                }
            } elseif (strpos($rel, 'icon') !== false) {
                // Any other rel mentioning "icon" is treated as a favicon.
                $type = Icon::FAVICON;
            }

            $href = $this->resolveUrl($url, $href);

            if (!empty($type) && filter_var($href, FILTER_VALIDATE_URL)) {
                $icons[] = new Icon($type, $href, $size);
            }
        }

        return $icons;
    }

    /**
     * Turns an href into an absolute URL relative to the site root.
     *
     * Absolute URLs are returned untouched, protocol-relative ones
     * ("//host/x") inherit the scheme of $base, and everything else is
     * appended to $base with exactly one "/" in between.
     *
     * @param string $base Site base URL.
     * @param string $href Value to resolve.
     *
     * @return string
     */
    private function resolveUrl($base, $href)
    {
        if (filter_var($href, FILTER_VALIDATE_URL) !== false) {
            return $href;
        }

        if (strpos($href, '//') === 0) {
            $scheme = parse_url($base, PHP_URL_SCHEME);

            return ($scheme ? $scheme : 'http') . ':' . $href;
        }

        return rtrim($base, '/') . '/' . ltrim($href, '/');
    }

    /**
     * Probes "{$url}/favicon.ico" and wraps it in an Icon when it answers 200.
     *
     * @param string $url Base URL without trailing slash.
     *
     * @return array A single-element list holding the favicon, or [].
     */
    private function getFavicon($url)
    {
        // Try /favicon.ico first.
        $info = $this->info("{$url}/favicon.ico");

        // info() reports failure with false, which must not be indexed.
        if ($info === false || $info['status'] !== '200') {
            return [];
        }

        // Make sure the favicon is an absolute URL.
        $favicon = $this->resolveUrl($url, $info['url']);

        return [
            new Icon(Icon::FAVICON, $favicon, [])
        ];
    }

    /**
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * @param string $url
     */
    public function setUrl($url)
    {
        $this->url = $url;
    }

    /**
     * Replaces the object used for network access.
     *
     * @param DataAccess $dataAccess
     */
    public function setDataAccess($dataAccess)
    {
        $this->dataAccess = $dataAccess;
    }
}
240