Completed
Push — master ( 7e57e4...fda467 )
by Jan-Petter
01:49
created

TxtParser::addHost()   C

Complexity

Conditions 9
Paths 11

Size

Total Lines 23
Code Lines 15

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 1 Features 0
Metric Value
c 3
b 1
f 0
dl 0
loc 23
rs 5.8541
cc 9
eloc 15
nc 11
nop 0
1
<?php
2
namespace vipnytt\RobotsTxtParser;
3
4
use vipnytt\RobotsTxtParser\Exceptions\TxtParserException;
5
6
/**
 * Class TxtParser
 *
 * Parses the raw content of a robots.txt file into a nested rule array.
 *
 * The content is split into lines; every line is reduced to a
 * `directive: rule` pair, validated by a directive specific handler and
 * merged into the rule array returned by getRules(). Directives listed as
 * children of `user-agent` in directives() are stored below every currently
 * active User-Agent, all other directives are stored at the top level.
 *
 * @package vipnytt\RobotsTxtParser
 */
class TxtParser
{
    /**
     * Robots.txt max length in bytes
     */
    const DEFAULT_BYTE_LIMIT = 500000;

    /**
     * Max rule length
     */
    const RULE_MAX_LENGTH = 500;

    /**
     * Directives
     */
    const DIRECTIVE_ALLOW = 'allow';
    const DIRECTIVE_CACHE_DELAY = 'cache-delay'; // unofficial
    const DIRECTIVE_CLEAN_PARAM = 'clean-param'; // Yandex only
    const DIRECTIVE_CRAWL_DELAY = 'crawl-delay';
    const DIRECTIVE_DISALLOW = 'disallow';
    const DIRECTIVE_HOST = 'host';  // Yandex only
    const DIRECTIVE_SITEMAP = 'sitemap';
    const DIRECTIVE_USER_AGENT = 'user-agent';

    /**
     * Default User-Agent
     */
    const FALLBACK_USER_AGENT = '*';

    /**
     * RAW robots.txt content
     * @var string
     */
    private $raw = '';

    /**
     * Rule array
     * @var array
     */
    private $rules = [];

    /**
     * User-Agents the following user-agent dependent rules apply to
     * @var array
     */
    private $userAgents = [self::FALLBACK_USER_AGENT];

    /**
     * Current line (or, while parsing inline directives, the remaining part of it)
     * @var string
     */
    private $line = '';

    /**
     * Directive of the previous successfully parsed line
     * @var string
     */
    private $previous;

    /**
     * Current Directive
     * @var string
     */
    private $directive;

    /**
     * Current Rule
     * @var array|string
     */
    private $rule;

    /**
     * Constructor
     *
     * Note: the (detected) encoding is installed as the global mbstring
     * internal encoding, because the mb_* calls in this class rely on it.
     *
     * @param string $content - file content
     * @param string|null $encoding - character encoding, detected from the content when null
     * @param int|null $byteLimit - maximum of bytes to parse, any non-integer disables the limit
     * @throws TxtParserException if the encoding is unknown or could not be detected
     */
    public function __construct($content, $encoding = null, $byteLimit = self::DEFAULT_BYTE_LIMIT)
    {
        if ($encoding === null) {
            // May return false if no candidate encoding matches
            $encoding = mb_detect_encoding($content);
        }
        try {
            $isSet = is_string($encoding) && mb_internal_encoding($encoding);
        } catch (\ValueError $e) {
            // PHP >= 8.0 throws on an unknown encoding instead of returning false
            $isSet = false;
        }
        if (!$isSet) {
            throw new TxtParserException(
                'Unable to set internal character encoding to `' . (is_string($encoding) ? $encoding : '') . '`'
            );
        }

        $this->raw = is_int($byteLimit) ? mb_strcut($content, 0, $byteLimit, $encoding) : $content;
        $this->parseTxt();
    }

    /**
     * Parse robots.txt
     *
     * Splits the raw content into trimmed, non-empty lines and merges the
     * result of every successfully parsed line into the rule array.
     *
     * @return void
     */
    private function parseTxt()
    {
        $lines = array_filter(array_map('trim', mb_split('\n', $this->raw)));
        // Parse each line individually
        foreach ($lines as $line) {
            // Limit rule length and remove comments
            $this->line = mb_split('#', mb_substr($line, 0, self::RULE_MAX_LENGTH), 2)[0];
            // Parse line, unsupported or invalid lines are skipped
            $result = $this->parseLine();
            if ($result === false) {
                continue;
            }
            // Add rule. New values are passed first, so they precede existing ones in merged lists.
            $this->previous = $this->directive;
            $this->rule = $result;
            $this->rules = array_merge_recursive($this->assignUserAgent(), $this->rules);
        }
    }

    /**
     * Generate Directive:Rule pair
     *
     * Splits the current line at the first colon. On success the lower-cased
     * directive and the trimmed rule are stored in $this->directive and
     * $this->rule; on failure both are left untouched.
     *
     * @return bool - false if the line does not hold a supported directive with a non-empty rule
     */
    private function generateRulePair()
    {
        // Split by directive and rule
        $pair = array_map('trim', mb_split(':', $this->line, 2));
        // Check if the line contains a rule
        if (empty($pair[0]) || empty($pair[1])) {
            return false;
        }
        $directive = mb_strtolower($pair[0]);
        if (!in_array($directive, $this->directives(), true)) {
            // Line does not contain any supported directive
            return false;
        }
        $this->directive = $directive;
        $this->rule = $pair[1];
        return true;
    }

    /**
     * Directives and sub directives
     *
     * @param string|null $parent
     * @return array - all supported directives if $parent is null, otherwise
     *                 the directives which are allowed inline/below $parent
     */
    private function directives($parent = null)
    {
        $array = [
            self::DIRECTIVE_ALLOW => [
                self::DIRECTIVE_CLEAN_PARAM,
                self::DIRECTIVE_HOST,
            ],
            self::DIRECTIVE_CACHE_DELAY => [],
            self::DIRECTIVE_CLEAN_PARAM => [],
            self::DIRECTIVE_CRAWL_DELAY => [],
            self::DIRECTIVE_DISALLOW => [
                self::DIRECTIVE_CLEAN_PARAM,
                self::DIRECTIVE_HOST,
            ],
            self::DIRECTIVE_HOST => [],
            self::DIRECTIVE_SITEMAP => [],
            self::DIRECTIVE_USER_AGENT => [
                self::DIRECTIVE_ALLOW,
                self::DIRECTIVE_CACHE_DELAY,
                self::DIRECTIVE_CRAWL_DELAY,
                self::DIRECTIVE_DISALLOW,
            ],
        ];
        if ($parent !== null) {
            return isset($array[$parent]) ? $array[$parent] : [];
        }
        return array_keys($array);
    }

    /**
     * Parse line
     *
     * Parses the current line and, recursively, any inline directive inside
     * its rule (e.g. `Disallow: Host: example.com`).
     *
     * @param string|null $parent - directive the current line is nested in, null for a top level line
     * @return array|false
     */
    private function parseLine($parent = null)
    {
        if (
            $this->generateRulePair() === false
            || !in_array($this->directive, $this->directives($parent), true)
        ) {
            return false;
        }
        // Cache directive/rule, the recursive call below overwrites both properties
        $directive = $this->directive;
        $rule = $this->rule;
        $this->line = (string)$this->rule;
        $inline = $this->parseLine($directive);
        if ($inline !== false) {
            // The rule itself is a valid inline directive, use its parsed result
            $rule = $inline;
        }
        $this->directive = $directive;
        $this->rule = $rule;
        return $this->add();
    }

    /**
     * Add value to directive
     *
     * Dispatches to the handler of the current directive.
     *
     * @return array|false
     */
    private function add()
    {
        switch ($this->directive) {
            case self::DIRECTIVE_ALLOW:
            case self::DIRECTIVE_DISALLOW:
                return $this->addDisAllow();
            case self::DIRECTIVE_CACHE_DELAY:
            case self::DIRECTIVE_CRAWL_DELAY:
                return $this->addFloat();
            case self::DIRECTIVE_CLEAN_PARAM:
                return $this->addCleanParam();
            case self::DIRECTIVE_HOST:
                return $this->addHost();
            case self::DIRECTIVE_SITEMAP:
                return $this->addSitemap();
            case self::DIRECTIVE_USER_AGENT:
                return $this->setUserAgent();
        }
        return false;
    }

    /**
     * Add an Allow or Disallow rule
     *
     * @return array
     */
    private function addDisAllow()
    {
        // If inline directive, pass the array
        if (is_array($this->rule)) {
            return [
                $this->directive => $this->rule,
            ];
        }
        // Return an array of paths
        return [
            $this->directive => [
                'path' => [
                    $this->rule,
                ],
            ],
        ];
    }

    /**
     * Add float value
     *
     * @return array|false - false if the rule does not evaluate to a non-zero number
     */
    private function addFloat()
    {
        $float = floatval($this->rule);
        if (empty($float)) {
            return false;
        }
        return [
            $this->directive => $float,
        ];
    }

    /**
     * Add Clean-Param record
     *
     * @return array|false
     */
    private function addCleanParam()
    {
        if (!is_string($this->rule)) {
            return false;
        }
        $result = [];
        $cleanParam = $this->explodeCleanParamRule($this->rule);
        foreach ($cleanParam['param'] as $param) {
            $result[$this->directive]['path'][$cleanParam['path']]['param'][] = $param;
        }
        return $result;
    }

    /**
     * Explode Clean-Param rule
     *
     * @param  string $rule - `param[&param...] [path]`
     * @return array - `['path' => string, 'param' => string[]]`, path defaults to `/*`
     */
    private function explodeCleanParamRule($rule)
    {
        // split into parameter and path
        $array = array_map('trim', mb_split('\s+', $rule, 2));
        $cleanParam = [];
        // Strip any invalid characters from path prefix. The hyphen is placed last in the
        // character class so it is a literal; written as `\.-\/` it formed a range from
        // `.` to `/` and hyphens were stripped from the path.
        $cleanParam['path'] = isset($array[1])
            ? $this->urlEncode(mb_ereg_replace('[^A-Za-z0-9.\/*_-]', '', $array[1]))
            : "/*";
        $cleanParam['param'] = array_map('trim', mb_split('&', $array[0]));
        return $cleanParam;
    }

    /**
     * URL encoder according to RFC 3986
     * Returns a string containing the encoded URL with disallowed characters converted to their percentage encodings.
     * @link http://publicmind.in/blog/url-encoding/
     *
     * @param string $url
     * @return string
     */
    private function urlEncode($url)
    {
        // Reserved character => pattern matching its percent-encoding. `%` must stay
        // last, otherwise a literal `%3A` in the input would be decoded to `:`.
        $reserved = [
            ":" => '!%3A!ui',
            "/" => '!%2F!ui',
            "?" => '!%3F!ui',
            "#" => '!%23!ui',
            "[" => '!%5B!ui',
            "]" => '!%5D!ui',
            "@" => '!%40!ui',
            "!" => '!%21!ui',
            "$" => '!%24!ui',
            "&" => '!%26!ui',
            "'" => '!%27!ui',
            "(" => '!%28!ui',
            ")" => '!%29!ui',
            "*" => '!%2A!ui',
            "+" => '!%2B!ui',
            "," => '!%2C!ui',
            ";" => '!%3B!ui',
            "=" => '!%3D!ui',
            "%" => '!%25!ui'
        ];
        // Encode everything, then restore the reserved characters
        return preg_replace(array_values($reserved), array_keys($reserved), rawurlencode($url));
    }

    /**
     * Add Host
     *
     * Accepts a bare host name (`example.com`) as well as a host with scheme
     * and/or port (`https://example.com:8080`).
     *
     * @return array|false - false if the rule is not a string or does not hold a valid host
     */
    private function addHost()
    {
        if (!is_string($this->rule)) {
            return false;
        }
        $parsed = parse_url($this->urlEncode($this->rule));
        if ($parsed === false) {
            return false;
        }
        // A bare host name without scheme is reported as `path` by parse_url().
        // Neither key is guaranteed to exist (e.g. for `?foo`).
        if (isset($parsed['host'])) {
            $host = $parsed['host'];
        } elseif (isset($parsed['path'])) {
            $host = $parsed['path'];
        } else {
            return false;
        }
        if (
            !self::urlValidateHost($host)
            || (isset($parsed['scheme']) && !self::urlValidateScheme($parsed['scheme']))
        ) {
            return false;
        }
        $scheme = isset($parsed['scheme']) ? $parsed['scheme'] . '://' : '';
        $port = isset($parsed['port']) ? ':' . $parsed['port'] : '';
        return [
            self::DIRECTIVE_HOST => [
                $scheme . $host . $port,
            ]
        ];
    }

    /**
     * Validate host name
     *
     * @link http://stackoverflow.com/questions/1755144/how-to-validate-domain-name-in-php
     *
     * @param  string $host
     * @return bool
     */
    private static function urlValidateHost($host)
    {
        // preg_match() is required here: the patterns are PCRE-delimited, and
        // mb_ereg_match() would treat the delimiters as literal characters and
        // therefore reject every host.
        return (
            is_string($host)
            && preg_match('/^([a-z\d](-*[a-z\d])*)(\.([a-z\d](-*[a-z\d])*))*$/iD', $host) === 1 //valid chars check
            && preg_match('/^.{1,253}$/D', $host) === 1 //overall length check
            && preg_match('/^[^\.]{1,63}(\.[^\.]{1,63})*$/D', $host) === 1 //length of each label
            && filter_var($host, FILTER_VALIDATE_IP) === false //is not an IP address
        );
    }

    /**
     * Validate URL scheme
     *
     * @param  string $scheme
     * @return bool
     */
    private static function urlValidateScheme($scheme)
    {
        return in_array(
            $scheme,
            [
                'http', 'https',
                'ftp', 'sftp',
            ],
            true
        );
    }

    /**
     * Add Sitemap
     *
     * @return array|false - false if the rule is not a string or not a valid URL
     */
    private function addSitemap()
    {
        if (!is_string($this->rule)) {
            return false;
        }
        $url = $this->urlEncode($this->rule);
        if (!$this->urlValidate($url)) {
            return false;
        }
        return [
            self::DIRECTIVE_SITEMAP => [
                $url,
            ]
        ];
    }

    /**
     * Validate URL
     *
     * @param string $url
     * @return bool - true if the URL is well-formed and has a valid host and a supported scheme
     */
    public function urlValidate($url)
    {
        if (filter_var($url, FILTER_VALIDATE_URL) === false) {
            return false;
        }
        $parsed = parse_url($url);
        // FILTER_VALIDATE_URL also accepts URLs without a host (e.g. `mailto:`)
        return (
            $parsed !== false
            && isset($parsed['host'], $parsed['scheme'])
            && self::urlValidateHost($parsed['host'])
            && self::urlValidateScheme($parsed['scheme'])
        );
    }

    /**
     * Set User-Agent(s)
     *
     * Consecutive User-agent lines form a group; any other directive in
     * between starts a new group.
     *
     * @return array|false - an empty array (nothing to add to the rule array),
     *                       or false if the rule is not a plain User-Agent name
     */
    private function setUserAgent()
    {
        // The rule is an array if the line contained an inline directive
        // (`User-agent: disallow: /`). It cannot serve as an array key later on.
        if (!is_string($this->rule)) {
            return false;
        }
        if ($this->previous === self::DIRECTIVE_USER_AGENT) {
            $this->userAgents[] = $this->rule;
        } else {
            $this->userAgents = [
                $this->rule,
            ];
        }
        return [];
    }

    /**
     * Assign User-Agent dependent rules to the User-Agent arrays
     *
     * @return array - the current rule, nested below every active User-Agent
     *                 if the current directive is User-Agent dependent
     */
    private function assignUserAgent()
    {
        if (in_array($this->directive, $this->directives(self::DIRECTIVE_USER_AGENT), true)) {
            $rule = [];
            foreach ($this->userAgents as $userAgent) {
                $rule[self::DIRECTIVE_USER_AGENT][$userAgent] = $this->rule;
            }
            return $rule;
        }
        return $this->rule;
    }

    /**
     * Get rules
     *
     * @return array - the parsed rules, keyed by directive
     */
    public function getRules()
    {
        return $this->rules;
    }
}
491