Completed
Push — master ( 821e1a...112faa )
by Jan-Petter
06:55
created

SitemapParser::getContent()   A

Complexity

Conditions 4
Paths 10

Size

Total Lines 16
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 1 Features 0
Metric Value
c 2
b 1
f 0
dl 0
loc 16
rs 9.2
cc 4
eloc 11
nc 10
nop 0
1
<?php
2
namespace vipnytt;
3
4
use GuzzleHttp;
5
use SimpleXMLElement;
6
use vipnytt\SitemapParser\Exceptions\SitemapParserException;
7
8
/**
 * SitemapParser class
 *
 * Parses XML sitemaps, sitemap indexes, plain-text URL lists and the
 * `Sitemap:` directives of robots.txt files, optionally following every
 * discovered sitemap recursively.
 *
 * @license https://opensource.org/licenses/MIT MIT license
 * @link https://github.com/VIPnytt/SitemapParser
 *
 * Specifications:
 * @link http://www.sitemaps.org/protocol.html
 */
class SitemapParser
{
    /**
     * User-Agent to send with every HTTP(S) request
     * @var string
     */
    protected $userAgent;

    /**
     * Configuration options; the `guzzle` key is passed to the HTTP client as request options
     * @var array
     */
    protected $config = [];

    /**
     * Sitemaps discovered, keyed by their `loc` URL
     * @var array
     */
    protected $sitemaps = [];

    /**
     * URLs discovered, keyed by their `loc` URL
     * @var array
     */
    protected $urls = [];

    /**
     * Sitemap URLs discovered but not yet parsed
     * @var array
     */
    protected $queue = [];

    /**
     * Parsed URLs history
     * @var array
     */
    protected $history = [];

    /**
     * Current URL being parsed
     * @var null|string
     */
    protected $currentURL = null;

    /**
     * Constructor
     *
     * Verifies that the required extensions are loaded and switches mbstring
     * to UTF-8 (note: this changes process-wide mbstring settings).
     *
     * @param string $userAgent User-Agent to send with every HTTP(S) request
     * @param array $config Configuration options
     * @throws SitemapParserException
     */
    public function __construct($userAgent = 'SitemapParser', $config = [])
    {
        if (!extension_loaded('simplexml')) {
            throw new SitemapParserException('The extension `simplexml` must be installed and loaded for this library');
        }
        if (!extension_loaded('mbstring')) {
            throw new SitemapParserException('The extension `mbstring` must be installed and loaded for this library');
        }
        mb_language("uni");
        if (!mb_internal_encoding('UTF-8')) {
            throw new SitemapParserException('Unable to set internal character encoding to UTF-8');
        }
        $this->config = $config;
        $this->userAgent = $userAgent;
    }

    /**
     * Parse Recursive
     *
     * Parses the given URL and then every sitemap discovered along the way,
     * accumulating all results in getSitemaps() / getURLs().
     *
     * @param string $url
     * @return void
     * @throws SitemapParserException
     */
    public function parseRecursive($url)
    {
        $this->addToQueue([$url]);
        while (count($todo = $this->getQueue()) > 0) {
            // parse() resets the result arrays, so keep what has been collected so far
            $sitemaps = $this->sitemaps;
            $urls = $this->urls;
            $this->parse($todo[0]);
            // Both maps are keyed by URL. array_merge_recursive() would turn an entry
            // seen twice into `'loc' => [url, url]`; array_replace_recursive() keeps
            // each tag scalar while still combining tags from both occurrences.
            $this->sitemaps = array_replace_recursive($sitemaps, $this->sitemaps);
            $this->urls = array_replace_recursive($urls, $this->urls);
        }
    }

    /**
     * Add an array of URLs to the parser queue
     *
     * @param array $urlArray
     * @return void
     */
    public function addToQueue($urlArray)
    {
        foreach ($urlArray as $url) {
            $this->queue[] = $url;
        }
    }

    /**
     * Sitemap URLs discovered but not yet parsed
     *
     * Rebuilds the queue from the queued URLs plus every discovered sitemap,
     * de-duplicated and with already parsed URLs removed.
     *
     * @return array
     */
    public function getQueue()
    {
        $candidates = array_unique(array_merge($this->queue, array_keys($this->sitemaps)));
        $this->queue = array_values(array_diff($candidates, $this->history));
        return $this->queue;
    }

    /**
     * Parse
     *
     * Resets the previous results, then parses the URL as robots.txt, as an
     * (optionally gzip compressed) XML sitemap, or - when the body is not
     * valid XML - as a plain-text list of URLs.
     *
     * @param string $url URL to parse
     * @param string|null $urlContent URL body content (skip download)
     * @return void
     * @throws SitemapParserException
     */
    public function parse($url, $urlContent = null)
    {
        $this->clean();
        $this->currentURL = $url;
        $response = is_string($urlContent) ? $urlContent : (string)$this->getContent();
        $this->history[] = $this->currentURL;
        if (parse_url($this->currentURL, PHP_URL_PATH) === '/robots.txt') {
            $this->parseRobotstxt($response);
            return;
        }
        $response = $this->gunzip($response);
        $sitemapJson = $this->generateXMLObject($response);
        if ($sitemapJson === false) {
            // Not XML: fall back to the plain-text sitemap format
            $this->parseString($response);
            return;
        }
        if (isset($sitemapJson->sitemap)) {
            $this->parseJson('sitemap', $sitemapJson->sitemap);
        }
        if (isset($sitemapJson->url)) {
            $this->parseJson('url', $sitemapJson->url);
        }
    }

    /**
     * Decompress the content if it starts with the gzip magic bytes
     *
     * @param string $content Raw body content
     * @return string Decompressed content, the untouched input when it is not
     *                gzip data, or an empty string when decompression fails
     * @throws SitemapParserException
     */
    private function gunzip($content)
    {
        // Byte-wise check of the 3 byte gzip header (ID1, ID2, CM=deflate)
        if (strncmp($content, "\x1f\x8b\x08", 3) !== 0) {
            return $content;
        }
        if (!function_exists('gzdecode')) {
            throw new SitemapParserException('The extension `zlib` must be installed and loaded to parse gzip compressed sitemaps');
        }
        $decoded = gzdecode($content);
        // Corrupt archives yield false; treat them as empty instead of passing false on
        return ($decoded === false) ? '' : $decoded;
    }

    /**
     * Cleanup between each parse
     *
     * @return void
     */
    protected function clean()
    {
        $this->sitemaps = [];
        $this->urls = [];
    }

    /**
     * Request the body content of an URL
     *
     * @return string Raw body content
     * @throws SitemapParserException
     */
    protected function getContent()
    {
        if (!filter_var($this->currentURL, FILTER_VALIDATE_URL)) {
            throw new SitemapParserException('Passed URL not valid according to filter_var function');
        }
        try {
            if (!isset($this->config['guzzle']['headers']['User-Agent'])) {
                $this->config['guzzle']['headers']['User-Agent'] = $this->userAgent;
            }
            $client = new GuzzleHttp\Client();
            $res = $client->request('GET', $this->currentURL, $this->config['guzzle']);
            // Cast so callers get the string promised by the docblock
            return (string)$res->getBody();
        } catch (GuzzleHttp\Exception\TransferException $e) {
            throw new SitemapParserException($e->getMessage());
        }
    }

    /**
     * Search for sitemaps in the robots.txt content
     *
     * @param string $robotstxt
     * @return void
     */
    protected function parseRobotstxt($robotstxt)
    {
        // One directive per line, case-insensitive field name, exactly one colon.
        // Anchoring at line start skips commented-out directives; an optional
        // UTF-8 byte order mark is tolerated in front of the directive.
        $pattern = '/^(?:\xEF\xBB\xBF)?[ \t]*sitemap[ \t]*:[ \t]*(\S+)/mi';
        if (!preg_match_all($pattern, $robotstxt, $match)) {
            return;
        }
        foreach ($match[1] as $sitemap) {
            $this->addArray('sitemap', ['loc' => trim($sitemap)]);
        }
    }

    /**
     * Validate URL arrays and add them to their corresponding arrays
     *
     * @param string $type sitemap|url
     * @param array $array Tag array
     * @return bool True when the entry was stored
     */
    protected function addArray($type, $array)
    {
        if (!isset($array['loc']) || !is_string($array['loc'])) {
            return false;
        }
        // Pretty-printed XML often wraps the URL in whitespace/newlines
        $array['loc'] = trim($array['loc']);
        if (filter_var($array['loc'], FILTER_VALIDATE_URL) === false) {
            return false;
        }
        switch ($type) {
            case 'sitemap':
                $this->sitemaps[$array['loc']] = $array;
                return true;
            case 'url':
                $this->urls[$array['loc']] = $array;
                return true;
        }
        return false;
    }

    /**
     * Generate the \SimpleXMLElement object if the XML is valid
     *
     * @param string $xml
     * @return \SimpleXMLElement|false
     */
    protected function generateXMLObject($xml)
    {
        // Collect libxml errors internally while parsing, then restore the
        // caller's setting so this library does not leak global state
        $previous = libxml_use_internal_errors(true);
        $doc = simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NOCDATA);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        return ($doc instanceof SimpleXMLElement) ? $doc : false;
    }

    /**
     * Parse plain text
     *
     * Every whitespace separated token that is a valid URL is stored, either
     * as a sitemap or as a regular URL.
     *
     * @param string $string
     * @return void
     */
    protected function parseString($string)
    {
        $tokens = preg_split('/\s+/', $string, -1, PREG_SPLIT_NO_EMPTY);
        if ($tokens === false) {
            return;
        }
        foreach ($tokens as $token) {
            // addArray() rejects anything that is not a valid URL
            $this->addArray($this->isSitemapURL($token) ? 'sitemap' : 'url', ['loc' => $token]);
        }
    }

    /**
     * Check if the URL may contain an Sitemap
     *
     * @param string $url
     * @return bool True for valid URLs whose path ends in `.xml` or `.xml.gz`
     */
    protected function isSitemapURL($url)
    {
        if (filter_var($url, FILTER_VALIDATE_URL) === false) {
            return false;
        }
        // parse_url() yields null/false when there is no (parsable) path
        $path = (string)parse_url($url, PHP_URL_PATH);
        return substr($path, -4) === ".xml" || substr($path, -7) === '.xml.gz';
    }

    /**
     * Parse Json object
     *
     * @param string $type Sitemap or URL
     * @param \SimpleXMLElement $json object
     * @return void
     */
    protected function parseJson($type, $json)
    {
        foreach ($json as $url) {
            $this->addArray($type, (array)$url);
        }
    }

    /**
     * Sitemaps discovered
     *
     * @return array
     */
    public function getSitemaps()
    {
        return $this->sitemaps;
    }

    /**
     * URLs discovered
     *
     * @return array
     */
    public function getURLs()
    {
        return $this->urls;
    }
}
321