Completed
Push — master ( ab9090...462d3d )
by Jan-Petter
02:22
created

SitemapParser::parse()   B

Complexity

Conditions 5
Paths 10

Size

Total Lines 22
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 1 Features 0
Metric Value
c 3
b 1
f 0
dl 0
loc 22
rs 8.6737
cc 5
eloc 16
nc 10
nop 2
1
<?php
2
namespace vipnytt;
3
4
use GuzzleHttp;
5
use SimpleXMLElement;
6
use vipnytt\SitemapParser\Exceptions\SitemapParserException;
7
8
/**
 * SitemapParser class
 *
 * Collects sitemap and page URLs from XML sitemaps, sitemap indexes,
 * robots.txt files and (when strict mode is disabled) plain text URL lists.
 *
 * @license https://opensource.org/licenses/MIT MIT license
 * @link https://github.com/VIPnytt/SitemapParser
 *
 * Specifications:
 * @link http://www.sitemaps.org/protocol.html
 */
class SitemapParser
{
    /**
     * Default encoding
     */
    const ENCODING = 'UTF-8';

    /**
     * XML file extension
     */
    const XML_EXTENSION = '.xml';

    /**
     * Compressed XML file extension
     */
    const XML_EXTENSION_COMPRESSED = '.xml.gz';

    /**
     * XML Sitemap tag
     */
    const XML_TAG_SITEMAP = 'sitemap';

    /**
     * XML URL tag
     */
    const XML_TAG_URL = 'url';

    /**
     * Robots.txt path
     */
    const ROBOTSTXT_PATH = '/robots.txt';

    /**
     * Robots.txt sitemap prefix
     */
    const ROBOTSTXT_PREFIX = 'Sitemap:';

    /**
     * User-Agent to send with every HTTP(S) request
     * @var string
     */
    protected $userAgent;

    /**
     * Configuration options
     *
     * Keys read by this class: `guzzle` (request options handed to the
     * HTTP client) and `strict` (set to `false` to accept plain text lists).
     * @var array
     */
    protected $config = [];

    /**
     * Sitemaps discovered, keyed by their `loc` URL
     * @var array
     */
    protected $sitemaps = [];

    /**
     * URLs discovered, keyed by their `loc` URL
     * @var array
     */
    protected $urls = [];

    /**
     * Sitemap URLs discovered but not yet parsed
     * @var array
     */
    protected $queue = [];

    /**
     * Parsed URLs history
     * @var array
     */
    protected $history = [];

    /**
     * Current URL being parsed
     * @var null|string
     */
    protected $currentURL;

    /**
     * Constructor
     *
     * Verifies that the required extensions are available and switches the
     * multibyte internal encoding to the class default.
     *
     * @param string $userAgent User-Agent to send with every HTTP(S) request
     * @param array $config Configuration options
     * @throws SitemapParserException
     */
    public function __construct($userAgent = 'SitemapParser', $config = [])
    {
        if (!extension_loaded('simplexml')) {
            throw new SitemapParserException('The extension `simplexml` must be installed and loaded for this library');
        }
        if (!extension_loaded('mbstring')) {
            throw new SitemapParserException('The extension `mbstring` must be installed and loaded for this library');
        }
        mb_language("uni");
        if (!mb_internal_encoding(self::ENCODING)) {
            throw new SitemapParserException('Unable to set internal character encoding to `' . self::ENCODING . '`');
        }
        $this->userAgent = $userAgent;
        $this->config = $config;
    }

    /**
     * Parse Recursive
     *
     * Parses the given URL and then every sitemap discovered along the way,
     * accumulating all sitemaps and URLs found.
     *
     * @param string $url
     * @return void
     * @throws SitemapParserException
     */
    public function parseRecursive($url)
    {
        $this->addToQueue([$url]);
        while (count($todo = $this->getQueue()) > 0) {
            // parse() resets both result sets, so remember what has been collected so far.
            $sitemaps = $this->sitemaps;
            $urls = $this->urls;
            $this->parse($todo[0]);
            // Both sets are keyed by URL. array_merge() lets the most recent entry win;
            // array_merge_recursive() would instead turn the tag values of a URL that
            // is listed twice into nested arrays (e.g. 'loc' => [url, url]).
            $this->sitemaps = array_merge($sitemaps, $this->sitemaps);
            $this->urls = array_merge($urls, $this->urls);
        }
    }

    /**
     * Add an array of URLs to the parser queue
     *
     * @param array $urlArray
     */
    public function addToQueue($urlArray)
    {
        foreach ($urlArray as $url) {
            $this->queue[] = $url;
        }
    }

    /**
     * Sitemap URLs discovered but not yet parsed
     *
     * Rebuilds the queue from the queued URLs plus every discovered sitemap,
     * without duplicates and without URLs that were already parsed.
     *
     * @return array
     */
    public function getQueue()
    {
        $candidates = array_unique(array_merge($this->queue, array_keys($this->sitemaps)));
        // array_values() re-indexes, so the next URL to parse is always at offset 0.
        $this->queue = array_values(array_diff($candidates, $this->history));
        return $this->queue;
    }

    /**
     * Parse
     *
     * Results of a previous parse are discarded. The content is handled as
     * robots.txt when the URL path says so, otherwise as (optionally gzip
     * compressed) XML, and finally as a plain text list.
     *
     * @param string $url URL to parse
     * @param string|null $urlContent URL body content (skip download)
     * @return void
     * @throws SitemapParserException
     */
    public function parse($url, $urlContent = null)
    {
        $this->clean();
        $this->currentURL = $url;
        $response = (is_string($urlContent)) ? $urlContent : $this->getContent();
        $this->history[] = $this->currentURL;
        if (parse_url($this->currentURL, PHP_URL_PATH) === self::ROBOTSTXT_PATH) {
            $this->parseRobotstxt($response);
            return;
        }
        // Check if the content is a gzip file. The magic bytes are compared
        // byte-wise, since the payload is binary rather than text.
        if (strncmp($response, "\x1f\x8b\x08", 3) === 0) {
            $decoded = gzdecode($response);
            // A corrupt archive yields no usable content.
            $response = ($decoded === false) ? '' : $decoded;
        }
        $sitemapXml = $this->generateXMLObject($response);
        if ($sitemapXml instanceof SimpleXMLElement === false) {
            // Not valid XML, fall back to the plain text format.
            $this->parseString($response);
            return;
        }
        $this->parseJson(self::XML_TAG_SITEMAP, $sitemapXml);
        $this->parseJson(self::XML_TAG_URL, $sitemapXml);
    }

    /**
     * Cleanup between each parse
     *
     * @return void
     */
    protected function clean()
    {
        $this->sitemaps = [];
        $this->urls = [];
    }

    /**
     * Request the body content of the current URL
     *
     * The configured User-Agent is added to the request headers unless the
     * `guzzle` configuration already provides one.
     *
     * @return string Raw body content
     * @throws SitemapParserException
     */
    protected function getContent()
    {
        if (!filter_var($this->currentURL, FILTER_VALIDATE_URL)) {
            throw new SitemapParserException('Passed URL not valid according to filter_var function');
        }
        try {
            if (!isset($this->config['guzzle']['headers']['User-Agent'])) {
                $this->config['guzzle']['headers']['User-Agent'] = $this->userAgent;
            }
            $client = new GuzzleHttp\Client();
            $res = $client->request('GET', $this->currentURL, $this->config['guzzle']);
            // getBody() hands back a stream object; callers expect a plain string.
            return (string) $res->getBody();
        } catch (GuzzleHttp\Exception\TransferException $e) {
            throw new SitemapParserException($e->getMessage());
        }
    }

    /**
     * Search for sitemaps in the robots.txt content
     *
     * @param string $robotstxt
     * @return bool Always true
     */
    protected function parseRobotstxt($robotstxt)
    {
        $array = array_map('trim', preg_split('/\R/', $robotstxt));
        foreach ($array as $line) {
            // The directive name is matched case-insensitively.
            if (mb_stripos($line, self::ROBOTSTXT_PREFIX) === 0) {
                $url = mb_substr($line, mb_strlen(self::ROBOTSTXT_PREFIX));
                // Drop a trailing comment.
                if (($pos = mb_strpos($url, '#')) !== false) {
                    $url = mb_substr($url, 0, $pos);
                }
                // Only the first whitespace separated token is the URL.
                $url = preg_split('/\s+/', trim($url))[0];
                $this->addArray(self::XML_TAG_SITEMAP, ['loc' => $url]);
            }
        }
        return true;
    }

    /**
     * Validate URL arrays and add them to their corresponding arrays
     *
     * The `loc` value is normalised to a trimmed string before validation,
     * because XML sources frequently pad it with whitespace.
     *
     * @param string $type sitemap|url
     * @param array $array Tag array
     * @return bool True if the entry was stored
     */
    protected function addArray($type, $array)
    {
        if (!isset($array['loc']) || is_array($array['loc'])) {
            return false;
        }
        $array['loc'] = trim((string) $array['loc']);
        if (filter_var($array['loc'], FILTER_VALIDATE_URL) === false) {
            return false;
        }
        switch ($type) {
            case self::XML_TAG_SITEMAP:
                $this->sitemaps[$array['loc']] = $array;
                return true;
            case self::XML_TAG_URL:
                $this->urls[$array['loc']] = $array;
                return true;
        }
        return false;
    }

    /**
     * Generate the \SimpleXMLElement object if the XML is valid
     *
     * @param string $xml
     * @return \SimpleXMLElement|false
     */
    protected function generateXMLObject($xml)
    {
        // Collect libxml errors internally so malformed input raises no warnings.
        $previous = libxml_use_internal_errors(true);
        try {
            return new SimpleXMLElement($xml, LIBXML_NOCDATA);
        } catch (\Exception $e) {
            return false;
        } finally {
            // Empty the error buffer and hand the global setting back to the caller.
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
    }

    /**
     * Parse line separated text string
     *
     * Only active when the `strict` configuration option is explicitly set
     * to `false`.
     *
     * @param string $string
     * @return bool False when skipped because of strict mode
     */
    protected function parseString($string)
    {
        if (!isset($this->config['strict']) || $this->config['strict'] !== false) {
            // Strings are not part of any documented sitemap standard
            return false;
        }
        $array = array_map('trim', preg_split('/\R/', $string));
        foreach ($array as $line) {
            if ($this->isSitemapURL($line)) {
                $this->addArray(self::XML_TAG_SITEMAP, ['loc' => $line]);
                continue;
            }
            $this->addArray(self::XML_TAG_URL, ['loc' => $line]);
        }
        return true;
    }

    /**
     * Check if the URL may point to a sitemap
     *
     * @param string $url
     * @return bool True for a valid URL whose path ends in `.xml` or `.xml.gz`
     */
    protected function isSitemapURL($url)
    {
        if (filter_var($url, FILTER_VALIDATE_URL) === false) {
            return false;
        }
        $path = parse_url($url, PHP_URL_PATH);
        if (!is_string($path)) {
            // URL without a path component.
            return false;
        }
        return substr($path, -strlen(self::XML_EXTENSION)) === self::XML_EXTENSION
            || substr($path, -strlen(self::XML_EXTENSION_COMPRESSED)) === self::XML_EXTENSION_COMPRESSED;
    }

    /**
     * Parse the child elements of a parsed XML document
     *
     * Despite the method name, the input is a \SimpleXMLElement object.
     *
     * @param string $type Sitemap or URL
     * @param \SimpleXMLElement $json object
     * @return bool False if the document has no elements of the given type
     */
    protected function parseJson($type, $json)
    {
        if (!isset($json->$type)) {
            return false;
        }
        foreach ($json->$type as $url) {
            $this->addArray($type, (array)$url);
        }
        return true;
    }

    /**
     * Sitemaps discovered
     *
     * @return array
     */
    public function getSitemaps()
    {
        return $this->sitemaps;
    }

    /**
     * URLs discovered
     *
     * @return array
     */
    public function getURLs()
    {
        return $this->urls;
    }
}
362