SitemapParser   B
last analyzed

Complexity

Total Complexity 49

Size/Duplication

Total Lines 392
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 6

Importance

Changes 0
Metric Value
wmc 49
lcom 1
cbo 6
dl 0
loc 392
rs 8.48
c 0
b 0
f 0

16 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 9 2
A parseRecursive() 0 16 3
A addToQueue() 0 9 3
A getQueue() 0 5 1
B parse() 0 25 6
A clean() 0 5 1
A getContent() 0 19 5
A parseRobotstxt() 0 25 5
A addArray() 0 18 5
A fixMissingTags() 0 9 3
A generateXMLObject() 0 13 2
A parseString() 0 16 5
A isSitemapURL() 0 8 3
A parseJson() 0 10 3
A getSitemaps() 0 4 1
A getURLs() 0 4 1

How to fix   Complexity   

Complex Class

Complex classes like SitemapParser often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use SitemapParser and, based on these observations, to apply Extract Interface, too.

1
<?php
2
3
namespace vipnytt;
4
5
use GuzzleHttp;
6
use SimpleXMLElement;
7
use vipnytt\SitemapParser\Exceptions;
8
use vipnytt\SitemapParser\UrlParser;
9
10
/**
11
 * SitemapParser class
12
 *
13
 * @license https://opensource.org/licenses/MIT MIT license
14
 * @link https://github.com/VIPnytt/SitemapParser
15
 *
16
 * Specifications:
17
 * @link http://www.sitemaps.org/protocol.html
18
 */
19
class SitemapParser
20
{
21
    use UrlParser;
22
23
    /**
24
     * Default User-Agent, used when the constructor is called without one
25
     */
26
    const DEFAULT_USER_AGENT = 'SitemapParser-VIPnytt/1.1 (+https://github.com/VIPnytt/SitemapParser/blob/master/README.md)';
27
28
    /**
29
     * Internal character encoding the constructor sets for the mbstring functions
30
     */
31
    const ENCODING = 'UTF-8';
32
33
    /**
34
     * XML file extension (without the leading dot), matched against URL paths by isSitemapURL()
35
     */
36
    const XML_EXTENSION = 'xml';
37
38
    /**
39
     * Compressed XML file extension (without the leading dot), matched against URL paths by isSitemapURL()
40
     */
41
    const XML_EXTENSION_COMPRESSED = 'xml.gz';
42
43
    /**
44
     * XML Sitemap tag; also the robots.txt directive name and the entry type handled by addArray()
45
     */
46
    const XML_TAG_SITEMAP = 'sitemap';
47
48
    /**
49
     * XML URL tag; also the entry type handled by addArray()
50
     */
51
    const XML_TAG_URL = 'url';
52
53
    /**
54
     * Robots.txt path; parse() treats a URL with exactly this path as a robots.txt file
55
     */
56
    const ROBOTSTXT_PATH = '/robots.txt';
57
58
    /**
59
     * User-Agent to send with every HTTP(S) request
     * (only applied when the Guzzle options do not already define a User-Agent header)
60
     * @var string
61
     */
62
    protected $userAgent;
63
64
    /**
65
     * Configuration options
     *
     * Keys read by this class:
     * - `guzzle`: request options handed to the Guzzle client in getContent()
     * - `strict`: must be exactly `false` for parseString() to accept plain-text URL lists
66
     * @var array
67
     */
68
    protected $config = [];
69
70
    /**
71
     * Sitemaps discovered, keyed by sitemap URL; each entry holds `loc` and `lastmod`
72
     * @var array
73
     */
74
    protected $sitemaps = [];
75
76
    /**
77
     * URLs discovered, keyed by URL; each entry holds `loc`, `lastmod`, `changefreq` and `priority`
78
     * @var array
79
     */
80
    protected $urls = [];
81
82
    /**
83
     * Sitemap URLs discovered but not yet parsed (recomputed by getQueue())
84
     * @var array
85
     */
86
    protected $queue = [];
87
88
    /**
89
     * Parsed URLs history; parse() appends every URL it is asked to handle
90
     * @var array
91
     */
92
    protected $history = [];
93
94
    /**
95
     * Current URL being parsed (set by parse(), fetched by getContent())
96
     * @var null|string
97
     */
98
    protected $currentURL;
99
100
    /**
     * Create a parser and prepare the multibyte string environment
     *
     * Sets the mbstring language to `uni` and the internal character encoding
     * to self::ENCODING, then stores the given settings.
     *
     * @param string $userAgent User-Agent to send with every HTTP(S) request
     * @param array $config Configuration options
     * @throws Exceptions\SitemapParserException When the internal character encoding could not be set
     */
    public function __construct($userAgent = self::DEFAULT_USER_AGENT, array $config = [])
    {
        mb_language("uni");
        $encodingApplied = mb_internal_encoding(self::ENCODING);
        if (!$encodingApplied) {
            $message = 'Unable to set internal character encoding to `' . self::ENCODING . '`';
            throw new Exceptions\SitemapParserException($message);
        }
        $this->config = $config;
        $this->userAgent = $userAgent;
    }
116
117
    /**
     * Parse a URL and then every sitemap discovered along the way
     *
     * The URL is queued and the queue is worked through until it is empty.
     * Results of every successful parse are accumulated in the lists returned
     * by getSitemaps() and getURLs(). URLs that cannot be fetched are skipped.
     *
     * @param string $url
     * @return void
     * @throws Exceptions\SitemapParserException
     */
    public function parseRecursive($url)
    {
        $this->addToQueue([$url]);
        while (count($todo = $this->getQueue()) > 0) {
            // parse() wipes both result lists, so keep what has been collected so far
            $sitemaps = $this->sitemaps;
            $urls = $this->urls;
            try {
                $this->parse($todo[0]);
            } catch (Exceptions\TransferException $e) {
                // Keep crawling, but put back the results gathered before the failed
                // request; otherwise one unreachable sitemap would discard all of them
                $this->sitemaps = $sitemaps;
                $this->urls = $urls;
                continue;
            }
            // Both lists are keyed by URL. A plain merge lets the most recent entry win,
            // whereas array_merge_recursive() folds entries sharing a URL into nested arrays
            $this->sitemaps = array_merge($sitemaps, $this->sitemaps);
            $this->urls = array_merge($urls, $this->urls);
        }
    }
140
141
    /**
     * Queue URLs for parsing
     *
     * Every entry is URL-encoded first; entries that fail validation are
     * silently dropped.
     *
     * @param array $urlArray URLs to add to the parser queue
     */
    public function addToQueue(array $urlArray)
    {
        foreach ($urlArray as $candidate) {
            $encoded = $this->urlEncode($candidate);
            if (!$this->urlValidate($encoded)) {
                continue;
            }
            $this->queue[] = $encoded;
        }
    }
155
156
    /**
     * Sitemap URLs discovered but not yet parsed
     *
     * Rebuilds the stored queue on every call: the queued URLs plus the URLs of
     * all known sitemaps, de-duplicated, minus everything already in the history.
     *
     * @return array Re-indexed list of pending URLs
     */
    public function getQueue()
    {
        $known = array_merge($this->queue, array_keys($this->sitemaps));
        $pending = array_diff(array_unique($known), $this->history);
        $this->queue = array_values($pending);
        return $this->queue;
    }
166
167
    /**
     * Parse a single URL
     *
     * Clears the results of the previous parse, records the URL in the history
     * and then handles the body as robots.txt, XML, or (when the XML is invalid)
     * as a line separated text list.
     *
     * @param string $url URL to parse
     * @param string|null $urlContent URL body content (provide to skip download)
     * @return void
     * @throws Exceptions\TransferException
     * @throws Exceptions\SitemapParserException
     */
    public function parse($url, $urlContent = null)
    {
        $this->clean();
        $this->currentURL = $this->urlEncode($url);
        if (!$this->urlValidate($this->currentURL)) {
            throw new Exceptions\SitemapParserException('Invalid URL');
        }
        $this->history[] = $this->currentURL;
        $response = is_string($urlContent) ? $urlContent : $this->getContent();
        if (parse_url($this->currentURL, PHP_URL_PATH) === self::ROBOTSTXT_PATH) {
            $this->parseRobotstxt($response);
            return;
        }
        // A gzip stream starts with the bytes 1f 8b 08. Compare the raw prefix byte-wise:
        // a multibyte search would scan the entire (binary) body and depends on how
        // mbstring treats bytes that are invalid in the chosen encoding
        if (strncmp($response, "\x1f\x8b\x08", 3) === 0) {
            $decoded = gzdecode($response);
            // gzdecode() returns false for corrupt data; treat that as an empty body
            $response = is_string($decoded) ? $decoded : '';
        }
        $xml = $this->generateXMLObject($response);
        if (!($xml instanceof SimpleXMLElement)) {
            // Not valid XML, fall back to the plain-text handling
            $this->parseString($response);
            return;
        }
        $this->parseJson(self::XML_TAG_SITEMAP, $xml);
        $this->parseJson(self::XML_TAG_URL, $xml);
    }
201
202
    /**
     * Reset the sitemap and URL result lists before a new parse
     *
     * @return void
     */
    protected function clean()
    {
        $this->sitemaps = $this->urls = [];
    }
212
213
    /**
     * Download the body of the current URL
     *
     * The request is sent with the Guzzle options from the `guzzle` configuration
     * key; the parser's User-Agent is added when those options do not set one.
     *
     * @return string Raw body content
     * @throws Exceptions\TransferException When Guzzle reports a transfer failure
     * @throws Exceptions\SitemapParserException When the URL is invalid or Guzzle fails otherwise
     */
    protected function getContent()
    {
        $this->currentURL = $this->urlEncode($this->currentURL);
        if (!$this->urlValidate($this->currentURL)) {
            throw new Exceptions\SitemapParserException('Invalid URL');
        }
        // Fall back to the parser's own User-Agent unless the options already carry one
        if (!isset($this->config['guzzle']['headers']['User-Agent'])) {
            $this->config['guzzle']['headers']['User-Agent'] = $this->userAgent;
        }
        try {
            $httpClient = new GuzzleHttp\Client();
            $response = $httpClient->request('GET', $this->currentURL, $this->config['guzzle']);
            return $response->getBody()->getContents();
        } catch (GuzzleHttp\Exception\TransferException $e) {
            throw new Exceptions\TransferException('Unable to fetch URL contents', 0, $e);
        } catch (GuzzleHttp\Exception\GuzzleException $e) {
            throw new Exceptions\SitemapParserException('GuzzleHttp exception', 0, $e);
        }
    }
239
240
    /**
     * Collect the sitemap directives of a robots.txt body
     *
     * Each valid `Sitemap:` URL is registered as a sitemap entry; all other
     * lines are ignored.
     *
     * @param string $robotstxt
     * @return bool Always true
     */
    protected function parseRobotstxt($robotstxt)
    {
        // One trimmed, non-empty entry per line
        $rows = array_filter(array_map('trim', mb_split('\r\n|\n|\r', $robotstxt)));
        foreach ($rows as $row) {
            // Drop everything from the comment marker onwards
            $withoutComment = mb_split('#', $row, 2)[0];
            // Directive left of the first colon, its value to the right
            $parts = array_map('trim', mb_split(':', $withoutComment, 2));
            $isSitemapDirective = mb_strtolower($parts[0]) === self::XML_TAG_SITEMAP;
            if (!$isSitemapDirective || empty($parts[1])) {
                // Not a supported directive, or no value given
                continue;
            }
            $sitemapUrl = $this->urlEncode($parts[1]);
            if ($this->urlValidate($sitemapUrl)) {
                $this->addArray(self::XML_TAG_SITEMAP, ['loc' => $sitemapUrl]);
            }
        }
        return true;
    }
271
272
    /**
     * Store a tag array in the sitemap or URL list, keyed by its location
     *
     * The `loc` value is trimmed and URL-encoded, and must pass validation.
     * Optional tags that are absent are filled in with null.
     *
     * @param string $type sitemap|url
     * @param array $array Tag array
     * @return bool True when the entry was stored, false when `loc` is missing
     *              or invalid, or the type is not supported
     */
    protected function addArray($type, array $array)
    {
        if (!isset($array['loc'])) {
            return false;
        }
        $array['loc'] = $this->urlEncode(trim($array['loc']));
        if (!$this->urlValidate($array['loc'])) {
            return false;
        }
        $location = $array['loc'];
        switch ($type) {
            case self::XML_TAG_SITEMAP:
                $this->sitemaps[$location] = $this->fixMissingTags(['lastmod'], $array);
                return true;
            case self::XML_TAG_URL:
                $this->urls[$location] = $this->fixMissingTags(['lastmod', 'changefreq', 'priority'], $array);
                return true;
            default:
                return false;
        }
    }
297
298
    /**
     * Make sure every listed tag exists, setting absent or blank ones to null
     *
     * A numeric zero such as a priority of `0` is a real value and is kept.
     *
     * @param array $tags Tags that must exist in the result
     * @param array $array Array to check
     * @return array The array with every missing or blank tag set to null
     */
    protected function fixMissingTags(array $tags, array $array)
    {
        foreach ($tags as $tag) {
            $value = isset($array[$tag]) ? $array[$tag] : null;
            // empty() alone also matches '0', which would erase a legitimate zero value
            if (empty($value) && !is_numeric($value)) {
                $array[$tag] = null;
            }
        }
        return $array;
    }
314
315
    /**
     * Generate the \SimpleXMLElement object if the XML is valid
     *
     * XML comments are stripped first, because a comment at the very beginning
     * of the file invalidates the XML (this occurs with certain versions of Yoast).
     *
     * Note: libxml internal error handling is switched on and left enabled.
     *
     * @param string $xml
     * @return \SimpleXMLElement|false False when the content is not valid XML
     */
    protected function generateXMLObject($xml)
    {
        $xml = (string)$xml;
        $stripped = preg_replace('/\s*\<\!\-\-((?!\-\-\>)[\s\S])*\-\-\>\s*/', '', $xml);
        // preg_replace() returns null when PCRE hits a backtrack or stack limit (e.g. on a
        // very long comment); parse the unstripped document then instead of an empty string
        if (is_string($stripped)) {
            $xml = $stripped;
        }
        try {
            libxml_use_internal_errors(true);
            return new SimpleXMLElement($xml, LIBXML_NOCDATA);
        } catch (\Exception $e) {
            return false;
        }
    }
334
335
    /**
     * Parse a line separated list of URLs
     *
     * Plain-text lists are not part of any documented sitemap standard, so they
     * are only accepted when the `strict` configuration option is exactly false.
     * Lines that look like sitemap URLs are stored as sitemaps, the rest as URLs.
     *
     * @param string $string
     * @return bool False when running in strict mode, true otherwise
     */
    protected function parseString($string)
    {
        $strict = isset($this->config['strict']) ? $this->config['strict'] : true;
        if ($strict !== false) {
            return false;
        }
        $lines = array_filter(array_map('trim', mb_split('\r\n|\n|\r', $string)));
        foreach ($lines as $line) {
            $type = $this->isSitemapURL($line) ? self::XML_TAG_SITEMAP : self::XML_TAG_URL;
            $this->addArray($type, ['loc' => $line]);
        }
        return true;
    }
357
358
    /**
     * Check if the URL may point to a sitemap
     *
     * This is the case when the URL is valid and its path ends in `.xml` or `.xml.gz`.
     *
     * @param string $url
     * @return bool
     */
    protected function isSitemapURL($url)
    {
        // parse_url() yields null for a URL without a path (and false for a malformed one);
        // cast so the string functions below always receive a string
        $path = (string)parse_url($this->urlEncode($url), PHP_URL_PATH);
        if (!$this->urlValidate($url)) {
            return false;
        }
        foreach ([self::XML_EXTENSION, self::XML_EXTENSION_COMPRESSED] as $extension) {
            $suffix = '.' . $extension;
            if (mb_substr($path, -mb_strlen($suffix)) === $suffix) {
                return true;
            }
        }
        return false;
    }
372
373
    /**
     * Register every child element of the given type
     *
     * Despite its name, this method works on a parsed XML document: each child
     * element named `$type` is cast to an array and handed to addArray().
     *
     * @param string $type Sitemap or URL
     * @param \SimpleXMLElement $json Parsed XML document
     * @return bool False when the document has no child of that type, true otherwise
     */
    protected function parseJson($type, \SimpleXMLElement $json)
    {
        $hasEntries = isset($json->$type);
        if ($hasEntries) {
            foreach ($json->$type as $element) {
                $this->addArray($type, (array)$element);
            }
        }
        return $hasEntries;
    }
390
391
    /**
     * Get the sitemaps collected so far
     *
     * @return array Sitemap entries keyed by sitemap URL
     */
    public function getSitemaps()
    {
        return $this->sitemaps;
    }
400
401
    /**
     * Get the URLs collected so far
     *
     * @return array URL entries keyed by URL
     */
    public function getURLs()
    {
        return $this->urls;
    }
410
}
411