Completed
Push — master ( 9f08bd...d4c5d1 )
by Jan-Petter
06:45 queued 10s
created

SitemapParser::clean()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 5
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
c 2
b 0
f 0
dl 0
loc 5
rs 9.4285
cc 1
eloc 3
nc 1
nop 0
1
<?php
2
namespace vipnytt;
3
4
use GuzzleHttp;
5
use SimpleXMLElement;
6
use vipnytt\SitemapParser\Exceptions\SitemapParserException;
7
use vipnytt\SitemapParser\UrlParser;
8
9
/**
 * SitemapParser class
 *
 * Collects the sitemaps and URLs referenced by a sitemap document. The body is
 * either downloaded over HTTP(S) or supplied by the caller, and may be an XML
 * sitemap / sitemap index (optionally gzip-compressed), a robots.txt file
 * containing `Sitemap:` directives or, when the `strict` config option is
 * explicitly set to false, a plain line-separated list of URLs.
 *
 * @license https://opensource.org/licenses/MIT MIT license
 * @link https://github.com/VIPnytt/SitemapParser
 *
 * Specifications:
 * @link http://www.sitemaps.org/protocol.html
 */
class SitemapParser
{
    use UrlParser;

    /**
     * Default User-Agent
     */
    const DEFAULT_USER_AGENT = 'SitemapParser';

    /**
     * Default encoding
     */
    const ENCODING = 'UTF-8';

    /**
     * XML file extension
     */
    const XML_EXTENSION = 'xml';

    /**
     * Compressed XML file extension
     */
    const XML_EXTENSION_COMPRESSED = 'xml.gz';

    /**
     * XML Sitemap tag
     */
    const XML_TAG_SITEMAP = 'sitemap';

    /**
     * XML URL tag
     */
    const XML_TAG_URL = 'url';

    /**
     * Robots.txt path
     */
    const ROBOTSTXT_PATH = '/robots.txt';

    /**
     * User-Agent to send with every HTTP(S) request
     * @var string
     */
    protected $userAgent = self::DEFAULT_USER_AGENT;

    /**
     * Configuration options. Keys read by this class: `guzzle` (request options
     * passed to the HTTP client) and `strict` (set to false to allow plain-text
     * URL lists).
     * @var array
     */
    protected $config = [];

    /**
     * Sitemaps discovered, keyed by their URL
     * @var array
     */
    protected $sitemaps = [];

    /**
     * URLs discovered, keyed by their URL
     * @var array
     */
    protected $urls = [];

    /**
     * Sitemap URLs discovered but not yet parsed
     * @var array
     */
    protected $queue = [];

    /**
     * Parsed URLs history
     * @var array
     */
    protected $history = [];

    /**
     * Current URL being parsed
     * @var null|string
     */
    protected $currentURL;

    /**
     * Constructor
     *
     * Note: switches the process-wide mbstring language to "uni" and the
     * internal character encoding to UTF-8.
     *
     * @param string $userAgent User-Agent to send with every HTTP(S) request
     * @param array $config Configuration options
     * @throws SitemapParserException If the internal encoding cannot be set
     */
    public function __construct($userAgent = self::DEFAULT_USER_AGENT, array $config = [])
    {
        mb_language("uni");
        if (!mb_internal_encoding(self::ENCODING)) {
            throw new SitemapParserException('Unable to set internal character encoding to `' . self::ENCODING . '`');
        }
        $this->userAgent = $userAgent;
        $this->config = $config;
    }

    /**
     * Parse Recursive
     *
     * Parses the given URL and then every sitemap it (transitively) references,
     * accumulating all discovered sitemaps and URLs.
     *
     * @param string $url
     * @return void
     * @throws SitemapParserException
     */
    public function parseRecursive($url)
    {
        $this->addToQueue([$url]);
        while (count($todo = $this->getQueue()) > 0) {
            // parse() resets both result sets, so hold on to what was collected so far
            $sitemaps = $this->sitemaps;
            $urls = $this->urls;
            $this->parse($todo[0]);
            // Both sets are keyed by URL. A plain merge lets a repeated URL
            // overwrite the earlier entry (as addArray() does within a single
            // document); array_merge_recursive() would instead turn the
            // entry's scalar values into nested arrays.
            $this->sitemaps = array_merge($sitemaps, $this->sitemaps);
            $this->urls = array_merge($urls, $this->urls);
        }
    }

    /**
     * Add an array of URLs to the parser queue
     *
     * @param array $urlArray
     * @return void
     */
    public function addToQueue(array $urlArray)
    {
        foreach ($urlArray as $url) {
            $this->queue[] = $url;
        }
    }

    /**
     * Sitemap URLs discovered but not yet parsed
     *
     * Rebuilds the queue on every call: the currently known sitemap URLs are
     * appended, duplicates and already parsed URLs are removed, and the result
     * is re-indexed from zero.
     *
     * @return array
     */
    public function getQueue()
    {
        $candidates = array_unique(array_merge($this->queue, array_keys($this->sitemaps)));
        $this->queue = array_values(array_diff($candidates, $this->history));
        return $this->queue;
    }

    /**
     * Parse
     *
     * Parses a single document. Previously collected results are discarded
     * first; use parseRecursive() to accumulate across documents.
     *
     * @param string $url URL to parse
     * @param string|null $urlContent URL body content (provide to skip download)
     * @return void
     * @throws SitemapParserException
     */
    public function parse($url, $urlContent = null)
    {
        $this->clean();
        $this->currentURL = $url;
        $response = is_string($urlContent) ? $urlContent : (string)$this->getContent();
        $this->history[] = $this->currentURL;
        if ($url !== $this->currentURL) {
            // getContent() stores the encoded form of the URL. Record the
            // form that was requested as well, otherwise getQueue() never
            // drops it and parseRecursive() would fetch it again forever.
            $this->history[] = $url;
        }
        if ($this->urlValidate($this->currentURL) && parse_url($this->currentURL, PHP_URL_PATH) === self::ROBOTSTXT_PATH) {
            $this->parseRobotstxt($response);
            return;
        }
        // Check if content is a gzip file (magic bytes + deflate method),
        // comparing raw bytes so the body is not interpreted as text
        if (strncmp($response, "\x1f\x8b\x08", 3) === 0) {
            $response = gzdecode($response);
            if ($response === false) {
                // Corrupt gzip stream, nothing to parse
                return;
            }
        }
        $sitemapJson = $this->generateXMLObject($response);
        if ($sitemapJson instanceof SimpleXMLElement === false) {
            // Not valid XML, fall back to the plain text format
            $this->parseString($response);
            return;
        }
        $this->parseJson(self::XML_TAG_SITEMAP, $sitemapJson);
        $this->parseJson(self::XML_TAG_URL, $sitemapJson);
    }

    /**
     * Cleanup between each parse
     *
     * @return void
     */
    protected function clean()
    {
        $this->sitemaps = [];
        $this->urls = [];
    }

    /**
     * Request the body content of an URL
     *
     * Encodes and validates the current URL, then performs a GET request using
     * the `guzzle` config options. The configured User-Agent is used unless the
     * options already contain a `User-Agent` header.
     *
     * @return string Raw body content
     * @throws SitemapParserException If the URL is invalid or the transfer fails
     */
    protected function getContent()
    {
        $this->currentURL = $this->urlEncode($this->currentURL);
        if (!$this->urlValidate($this->currentURL)) {
            throw new SitemapParserException('Invalid URL');
        }
        try {
            if (!isset($this->config['guzzle']['headers']['User-Agent'])) {
                $this->config['guzzle']['headers']['User-Agent'] = $this->userAgent;
            }
            $client = new GuzzleHttp\Client();
            $res = $client->request('GET', $this->currentURL, $this->config['guzzle']);
            // getBody() yields a stream object; hand back the documented string
            return (string)$res->getBody();
        } catch (GuzzleHttp\Exception\TransferException $e) {
            throw new SitemapParserException($e->getMessage());
        }
    }

    /**
     * Search for sitemaps in the robots.txt content
     *
     * @param string $robotstxt
     * @return bool Always true
     */
    protected function parseRobotstxt($robotstxt)
    {
        // Split lines into array, dropping blank lines
        $lines = array_filter(array_map('trim', mb_split('\r\n|\n|\r', $robotstxt)));
        // Parse each line individually
        foreach ($lines as $line) {
            // Remove comments
            $line = mb_split('#', $line, 2)[0];
            // Split by directive and rule
            $pair = array_map('trim', mb_split(':', $line, 2));
            // Check if the line contains a sitemap
            if (
                mb_strtolower($pair[0]) !== self::XML_TAG_SITEMAP ||
                empty($pair[1])
            ) {
                // Line does not contain any supported directive
                continue;
            }
            $url = $this->urlEncode($pair[1]);
            if ($this->urlValidate($url)) {
                $this->addArray(self::XML_TAG_SITEMAP, ['loc' => $url]);
            }
        }
        return true;
    }

    /**
     * Validate URL arrays and add them to their corresponding arrays
     *
     * The entry is stored under its encoded `loc`; an entry with the same `loc`
     * replaces the earlier one.
     *
     * @param string $type sitemap|url
     * @param array $array Tag array
     * @return bool True if the entry was stored
     */
    protected function addArray($type, array $array)
    {
        if (!isset($array['loc'])) {
            return false;
        }
        $array['loc'] = $this->urlEncode($array['loc']);
        if ($this->urlValidate($array['loc'])) {
            switch ($type) {
                case self::XML_TAG_SITEMAP:
                    // <sitemap> entries only have one optional tag
                    $this->sitemaps[$array['loc']] = $this->fixMissingTags(['lastmod'], $array);
                    return true;
                case self::XML_TAG_URL:
                    // <url> entries have three optional tags
                    $this->urls[$array['loc']] = $this->fixMissingTags(['lastmod', 'changefreq', 'priority'], $array);
                    return true;
            }
        }
        return false;
    }

    /**
     * Check for missing values and set them to null
     *
     * @param array $tags Tags check if exists
     * @param array $array Array to check
     * @return array The array with every listed tag present
     */
    protected function fixMissingTags(array $tags, array $array)
    {
        foreach ($tags as $tag) {
            // Absent and empty values count as missing, but a numeric zero
            // (e.g. a priority of "0") is a real value and must be kept
            if (empty($array[$tag]) && !(isset($array[$tag]) && is_numeric($array[$tag]))) {
                $array[$tag] = null;
            }
        }
        return $array;
    }

    /**
     * Generate the \SimpleXMLElement object if the XML is valid
     *
     * Note: leaves libxml's internal error handling enabled.
     *
     * @param string $xml
     * @return \SimpleXMLElement|false
     */
    protected function generateXMLObject($xml)
    {
        if (empty($xml)) {
            // An empty body can never be a valid XML document
            return false;
        }
        try {
            // Collect parser errors internally instead of emitting warnings
            libxml_use_internal_errors(true);
            return new SimpleXMLElement($xml, LIBXML_NOCDATA);
        } catch (\Exception $e) {
            return false;
        }
    }

    /**
     * Parse line separated text string
     *
     * Only active when the `strict` config option is explicitly set to false.
     *
     * @param string $string
     * @return bool False if skipped because of strict mode, otherwise true
     */
    protected function parseString($string)
    {
        if (!isset($this->config['strict']) || $this->config['strict'] !== false) {
            // Strings are not part of any documented sitemap standard
            return false;
        }
        $array = array_filter(array_map('trim', mb_split('\r\n|\n|\r', $string)));
        foreach ($array as $line) {
            if ($this->isSitemapURL($line)) {
                $this->addArray(self::XML_TAG_SITEMAP, ['loc' => $line]);
                continue;
            }
            $this->addArray(self::XML_TAG_URL, ['loc' => $line]);
        }
        return true;
    }

    /**
     * Check if the URL may contain an Sitemap
     *
     * True when the URL is valid and its path ends with `.xml` or `.xml.gz`.
     *
     * @param string $url
     * @return bool
     */
    protected function isSitemapURL($url)
    {
        $path = parse_url($this->urlEncode($url), PHP_URL_PATH);
        // parse_url() yields a non-string when there is no path or the URL is malformed
        if (!is_string($path) || !$this->urlValidate($url)) {
            return false;
        }
        foreach ([self::XML_EXTENSION, self::XML_EXTENSION_COMPRESSED] as $extension) {
            $suffix = '.' . $extension;
            if (mb_substr($path, -mb_strlen($suffix)) === $suffix) {
                return true;
            }
        }
        return false;
    }

    /**
     * Parse Json object
     *
     * Despite the name, the input is the parsed XML document; every child
     * element named $type is converted to an array and passed to addArray().
     *
     * @param string $type Sitemap or URL
     * @param \SimpleXMLElement $json object
     * @return bool False if the document has no element of the given type
     */
    protected function parseJson($type, \SimpleXMLElement $json)
    {
        if (!isset($json->$type)) {
            return false;
        }
        foreach ($json->$type as $url) {
            $this->addArray($type, (array)$url);
        }
        return true;
    }

    /**
     * Sitemaps discovered
     *
     * @return array
     */
    public function getSitemaps()
    {
        return $this->sitemaps;
    }

    /**
     * URLs discovered
     *
     * @return array
     */
    public function getURLs()
    {
        return $this->urls;
    }
}
391