Completed
Push — master ( 768f08...ab9090 )
by Jan-Petter
02:00
created

SitemapParser   B

Complexity

Total Complexity 44

Size/Duplication

Total Lines 308
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 3

Importance

Changes 6
Bugs 2 Features 0
Metric Value
wmc 44
c 6
b 2
f 0
lcom 1
cbo 3
dl 0
loc 308
rs 8.3396

15 Methods

Rating   Name   Duplication   Size   Complexity  
A parseRobotstxt() 0 10 3
A __construct() 0 15 4
A parseRecursive() 0 11 2
A addToQueue() 0 6 2
A getQueue() 0 5 1
C parse() 0 26 7
A clean() 0 5 1
A getContent() 0 16 4
B addArray() 0 14 5
A generateXMLObject() 0 9 2
A isSitemapURL() 0 8 3
A parseJson() 0 6 2
B parseString() 0 19 6
A getSitemaps() 0 4 1
A getURLs() 0 4 1

How to fix   Complexity   

Complex Class

Complex classes like SitemapParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields or methods that share the same prefix or suffix. You can also look at the cohesion graph to spot any unconnected or weakly connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use SitemapParser, and based on these observations, apply Extract Interface, too.

1
<?php
2
namespace vipnytt;
3
4
use GuzzleHttp;
5
use SimpleXMLElement;
6
use vipnytt\SitemapParser\Exceptions\SitemapParserException;
7
8
/**
9
 * SitemapParser class
10
 *
11
 * @license https://opensource.org/licenses/MIT MIT license
12
 * @link https://github.com/VIPnytt/SitemapParser
13
 *
14
 * Specifications:
15
 * @link http://www.sitemaps.org/protocol.html
16
 */
17
class SitemapParser
{
    /**
     * User-Agent to send with every HTTP(S) request
     * @var string
     */
    protected $userAgent;

    /**
     * Configuration options
     *
     * Keys read by this class: `guzzle` (request options passed to the
     * Guzzle client) and `strict` (set to `false` to allow plain text parsing).
     * @var array
     */
    protected $config = [];

    /**
     * Sitemaps discovered, keyed by their `loc` URL
     * @var array
     */
    protected $sitemaps = [];

    /**
     * URLs discovered, keyed by their `loc` URL
     * @var array
     */
    protected $urls = [];

    /**
     * Sitemap URLs discovered but not yet parsed
     * @var array
     */
    protected $queue = [];

    /**
     * Parsed URLs history
     * @var array
     */
    protected $history = [];

    /**
     * Current URL being parsed
     * @var null|string
     */
    protected $currentURL;

    /**
     * Constructor
     *
     * Verifies the required extensions and switches mbstring to UTF-8.
     * Note that `mb_language()` and `mb_internal_encoding()` change
     * process-wide settings.
     *
     * @param string $userAgent User-Agent to send with every HTTP(S) request
     * @param array $config Configuration options
     * @throws SitemapParserException
     */
    public function __construct($userAgent = 'SitemapParser', $config = [])
    {
        if (!extension_loaded('simplexml')) {
            throw new SitemapParserException('The extension `simplexml` must be installed and loaded for this library');
        }
        if (!extension_loaded('mbstring')) {
            throw new SitemapParserException('The extension `mbstring` must be installed and loaded for this library');
        }
        mb_language("uni");
        if (!mb_internal_encoding('UTF-8')) {
            throw new SitemapParserException('Unable to set internal character encoding to UTF-8');
        }
        $this->userAgent = $userAgent;
        $this->config = $config;
    }

    /**
     * Parse Recursive
     *
     * Parses the given URL and then every sitemap discovered along the way,
     * accumulating all sitemaps and URLs found. An exception thrown by
     * parse() is not caught here and aborts the whole run.
     *
     * @param string $url
     * @return void
     * @throws SitemapParserException
     */
    public function parseRecursive($url)
    {
        $this->addToQueue([$url]);
        while (count($todo = $this->getQueue()) > 0) {
            // parse() resets the result arrays, so keep what has been found so far
            $sitemaps = $this->sitemaps;
            $urls = $this->urls;
            $this->parse($todo[0]);
            // Plain array_merge: when the same URL is listed in more than one
            // sitemap the latest entry replaces the earlier one. The recursive
            // variant would fold both entries into nested arrays and corrupt
            // the tag array (e.g. `loc` becoming an array of strings).
            $this->sitemaps = array_merge($sitemaps, $this->sitemaps);
            $this->urls = array_merge($urls, $this->urls);
        }
    }

    /**
     * Add an array of URLs to the parser queue
     *
     * @param array $urlArray
     */
    public function addToQueue($urlArray)
    {
        foreach ($urlArray as $url) {
            $this->queue[] = $url;
        }
    }

    /**
     * Sitemap URLs discovered but not yet parsed
     *
     * Rebuilds the queue from the queued URLs plus every discovered sitemap,
     * without duplicates and without URLs that have already been parsed.
     *
     * @return array
     */
    public function getQueue()
    {
        $candidates = array_unique(array_merge($this->queue, array_keys($this->sitemaps)));
        $this->queue = array_values(array_diff($candidates, $this->history));
        return $this->queue;
    }

    /**
     * Parse
     *
     * Results of a previous call are discarded; use getSitemaps() and
     * getURLs() to read the outcome of this call.
     *
     * @param string $url URL to parse
     * @param string|null $urlContent URL body content (skip download)
     * @return void
     * @throws SitemapParserException
     */
    public function parse($url, $urlContent = null)
    {
        $this->clean();
        $this->currentURL = $url;
        $response = is_string($urlContent) ? $urlContent : $this->getContent();
        $this->history[] = $this->currentURL;
        if (parse_url($this->currentURL, PHP_URL_PATH) === '/robots.txt') {
            $this->parseRobotstxt($response);
            return;
        }
        // Check if content is a gzip file (magic bytes + deflate method).
        // Compared byte-wise: the payload is binary, not text.
        if (strncmp($response, "\x1f\x8b\x08", 3) === 0) {
            $response = gzdecode($response);
            if ($response === false) {
                // Corrupt archive: nothing can be extracted from it
                return;
            }
        }
        $xml = $this->generateXMLObject($response);
        if ($xml instanceof SimpleXMLElement === false) {
            // Not valid XML, fall back to plain text (if allowed by config)
            $this->parseString($response);
            return;
        }
        if (isset($xml->sitemap)) {
            $this->parseJson('sitemap', $xml->sitemap);
        }
        if (isset($xml->url)) {
            $this->parseJson('url', $xml->url);
        }
    }

    /**
     * Cleanup between each parse
     *
     * @return void
     */
    protected function clean()
    {
        $this->sitemaps = [];
        $this->urls = [];
    }

    /**
     * Request the body content of an URL
     *
     * Downloads the current URL with Guzzle, using the `guzzle` config options.
     * The configured User-Agent is applied unless the options already set one.
     *
     * @return string Raw body content
     * @throws SitemapParserException
     */
    protected function getContent()
    {
        if (!filter_var($this->currentURL, FILTER_VALIDATE_URL)) {
            throw new SitemapParserException('Passed URL not valid according to filter_var function');
        }
        try {
            if (!isset($this->config['guzzle']['headers']['User-Agent'])) {
                $this->config['guzzle']['headers']['User-Agent'] = $this->userAgent;
            }
            $client = new GuzzleHttp\Client();
            $res = $client->request('GET', $this->currentURL, $this->config['guzzle']);
            // getBody() returns a stream object; callers need the actual string
            return (string)$res->getBody();
        } catch (GuzzleHttp\Exception\TransferException $e) {
            throw new SitemapParserException($e->getMessage());
        }
    }

    /**
     * Search for sitemaps in the robots.txt content
     *
     * Only lines starting with the `Sitemap` directive are considered. The
     * directive name is matched case-insensitively and anything after a `#`
     * is treated as a comment.
     *
     * @param string $robotstxt
     * @return void
     */
    protected function parseRobotstxt($robotstxt)
    {
        // (?:^|[\r\n]) anchors the directive to the start of a line for any
        // line ending style; [^#\r\n]* stops at a comment or the end of line.
        $found = preg_match_all('/(?:^|[\r\n])[ \t]*Sitemap[ \t]*:([^#\r\n]*)/i', $robotstxt, $match);
        if (!$found) {
            return;
        }
        foreach ($match[1] as $sitemap) {
            $this->addArray('sitemap', ['loc' => trim($sitemap)]);
        }
    }

    /**
     * Validate URL arrays and add them to their corresponding arrays
     *
     * The `loc` value is trimmed before validation, since pretty-printed
     * sitemaps often pad the URL with whitespace or line breaks.
     *
     * @param string $type sitemap|url
     * @param array $array Tag array
     * @return bool True if the entry was stored, false if it was rejected
     */
    protected function addArray($type, $array)
    {
        if (!isset($array['loc']) || !is_string($array['loc'])) {
            return false;
        }
        $array['loc'] = trim($array['loc']);
        if (filter_var($array['loc'], FILTER_VALIDATE_URL) === false) {
            return false;
        }
        switch ($type) {
            case 'sitemap':
                $this->sitemaps[$array['loc']] = $array;
                return true;
            case 'url':
                $this->urls[$array['loc']] = $array;
                return true;
        }
        return false;
    }

    /**
     * Generate the \SimpleXMLElement object if the XML is valid
     *
     * @param string $xml
     * @return \SimpleXMLElement|false
     */
    protected function generateXMLObject($xml)
    {
        // Collect libxml errors internally instead of emitting PHP warnings
        $previous = libxml_use_internal_errors(true);
        try {
            $object = new SimpleXMLElement($xml, LIBXML_NOCDATA);
        } catch (\Exception $e) {
            $object = false;
        }
        // Drop the collected errors and hand the setting back to the caller,
        // otherwise the buffer grows with every invalid document parsed.
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        return $object;
    }

    /**
     * Parse plain text
     *
     * Every whitespace separated token that is a valid URL is added, either
     * as sitemap or as URL. Disabled unless the `strict` config option is
     * explicitly set to `false`.
     *
     * @param string $string
     * @return bool False if plain text parsing is disabled, true otherwise
     */
    protected function parseString($string)
    {
        if (!isset($this->config['strict']) || $this->config['strict'] !== false) {
            // Strings are not part of any sitemap standard
            return false;
        }
        $tokens = preg_split('/\s+/', $string, -1, PREG_SPLIT_NO_EMPTY);
        if ($tokens === false) {
            return true;
        }
        foreach ($tokens as $token) {
            // addArray() validates the URL, invalid tokens are simply skipped
            $this->addArray($this->isSitemapURL($token) ? 'sitemap' : 'url', ['loc' => $token]);
        }
        return true;
    }

    /**
     * Check if the URL may contain a Sitemap
     *
     * @param string $url
     * @return bool True for valid URLs with a path ending in `.xml` or `.xml.gz`
     */
    protected function isSitemapURL($url)
    {
        if (filter_var($url, FILTER_VALIDATE_URL) === false) {
            return false;
        }
        // parse_url() yields null for URLs without a path
        $path = (string)parse_url($url, PHP_URL_PATH);
        return substr($path, -4) === ".xml" || substr($path, -7) === '.xml.gz';
    }

    /**
     * Parse Json object
     *
     * Despite the name, this iterates over SimpleXML nodes: each node is
     * converted to an array and handed to addArray().
     *
     * @param string $type Sitemap or URL
     * @param \SimpleXMLElement $json object
     * @return void
     */
    protected function parseJson($type, $json)
    {
        foreach ($json as $url) {
            $this->addArray($type, (array)$url);
        }
    }

    /**
     * Sitemaps discovered
     *
     * @return array
     */
    public function getSitemaps()
    {
        return $this->sitemaps;
    }

    /**
     * URLs discovered
     *
     * @return array
     */
    public function getURLs()
    {
        return $this->urls;
    }
}
325