Completed
Push — master ( d1d688...7e57e4 )
by Jan-Petter
01:56
created

TxtParser::addHost()   B

Complexity

Conditions 8
Paths 11

Size

Total Lines 20
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
c 2
b 0
f 0
dl 0
loc 20
rs 7.7777
cc 8
eloc 13
nc 11
nop 0
1
<?php
2
namespace vipnytt\RobotsTxtParser;
3
4
use vipnytt\RobotsTxtParser\Exceptions\TxtParserException;
5
6
/**
7
 * Class TxtParser
8
 *
9
 * @package vipnytt\RobotsTxtParser
10
 */
11
class TxtParser
12
{
13
    /**
14
     * Default maximum number of bytes of robots.txt content to parse (constructor default for $byteLimit)
15
     */
16
    const DEFAULT_BYTE_LIMIT = 500000;
17
18
    /**
19
     * Maximum length of a single line; longer lines are truncated before they are parsed
20
     */
21
    const RULE_MAX_LENGTH = 500;
22
23
    /**
24
     * Supported directive names, in the lower-case form they are matched against
25
     */
26
    const DIRECTIVE_ALLOW = 'allow';
27
    const DIRECTIVE_CACHE_DELAY = 'cache-delay'; // unofficial
28
    const DIRECTIVE_CLEAN_PARAM = 'clean-param'; // Yandex only
29
    const DIRECTIVE_CRAWL_DELAY = 'crawl-delay';
30
    const DIRECTIVE_DISALLOW = 'disallow';
31
    const DIRECTIVE_HOST = 'host';  // Yandex only
32
    const DIRECTIVE_SITEMAP = 'sitemap';
33
    const DIRECTIVE_USER_AGENT = 'user-agent';
34
35
    /**
36
     * User-Agent that rules are assigned to until a User-agent line has been parsed
37
     */
38
    const FALLBACK_USER_AGENT = '*';
39
40
    /**
41
     * Robots.txt content to parse, after the byte limit has been applied
42
     * @var string
43
     */
44
    private $raw = '';
45
46
    /**
47
     * Parsed rules, as returned by getRules()
48
     * @var array
49
     */
50
    private $rules = [];
51
52
    /**
53
     * User-Agents that the rules currently being parsed are assigned to
54
     * @var array
55
     */
56
    private $userAgents = [self::FALLBACK_USER_AGENT];
57
58
    /**
59
     * Line (or inline remainder of a line) currently being parsed
60
     * @var string
61
     */
62
    private $line = '';
63
64
    /**
65
     * Directive of the most recent line that was parsed successfully
66
     * @var string
67
     */
68
    private $previous;
69
70
    /**
71
     * Directive of the pair currently being parsed (lower-case)
72
     * @var string
73
     */
74
    private $directive;
75
76
    /**
77
     * Value of the current pair: the raw string, or the parsed array of an inline directive
78
     * @var array|string
79
     */
80
    private $rule;
81
82
    /**
     * Constructor
     *
     * Selects the character encoding, cuts the content down to the byte limit and parses it.
     * Note that mb_internal_encoding() changes the mbstring internal encoding for the whole process.
     *
     * @param string $content - file content
     * @param string|null $encoding - character encoding, detected from the content when null
     * @param int|null $byteLimit - maximum of bytes to parse, any non-integer value disables the limit
     * @throws TxtParserException if the encoding can not be detected or applied
     */
    public function __construct($content, $encoding = null, $byteLimit = self::DEFAULT_BYTE_LIMIT)
    {
        if ($encoding === null) {
            // returns false when none of the candidate encodings fits the content
            $encoding = mb_detect_encoding($content);
        }
        $accepted = false;
        if (is_string($encoding) && $encoding !== '') {
            try {
                $accepted = mb_internal_encoding($encoding);
            } catch (\ValueError $e) {
                // PHP 8+ throws for an unknown encoding instead of returning false
                $accepted = false;
            }
        }
        if (!$accepted) {
            throw new TxtParserException('Unable to set internal character encoding to `' . $encoding . '`');
        }

        // mb_strcut() cuts on a character boundary, so a multi-byte character is never split
        $this->raw = is_int($byteLimit) ? mb_strcut($content, 0, $byteLimit, $encoding) : $content;
        $this->parseTxt();
    }
102
103
    /**
     * Parse robots.txt
     *
     * Walks through the raw content line by line and merges every accepted rule into the rule array.
     *
     * @return void
     */
    private function parseTxt()
    {
        // Trim every line and drop the ones that end up empty
        $rows = array_filter(array_map('trim', mb_split('\n', $this->raw)));
        foreach ($rows as $this->line) {
            // Enforce the max line length, then cut off any trailing comment
            $truncated = mb_substr($this->line, 0, self::RULE_MAX_LENGTH);
            $parts = mb_split('#', $truncated, 2);
            $this->line = $parts[0];
            // Skip lines without a supported directive:rule pair
            if ($this->generateRulePair() === false) {
                continue;
            }
            // Skip lines whose rule could not be parsed
            $result = $this->parseLine();
            if ($result === false) {
                continue;
            }
            // Remember the directive and store the parsed rule
            $this->previous = $this->directive;
            $this->rule = $result;
            $this->rules = array_merge_recursive($this->assignUserAgent(), $this->rules);
        }
    }
128
129
    /**
     * Generate Directive:Rule pair
     *
     * Splits the current line at the first colon. On success the lower-cased directive and
     * the trimmed rule are stored in $this->directive and $this->rule.
     *
     * @return bool - false if either part is empty or the directive is not supported
     */
    private function generateRulePair()
    {
        $parts = array_map('trim', mb_split(':', $this->line, 2));
        // Both a directive and a rule are required
        if (empty($parts[1]) || empty($parts[0])) {
            return false;
        }
        // Directives are matched case-insensitively
        $name = mb_strtolower($parts[0]);
        if (!in_array($name, $this->directives())) {
            return false;
        }
        $this->directive = $name;
        $this->rule = $parts[1];
        return true;
    }
151
152
    /**
     * Directives and sub directives
     *
     * Without a parent, the names of all supported directives are returned.
     * With a parent, the directives listed below that parent are returned.
     *
     * @param string|null $parent
     * @return array - list of directive names, empty if the parent is unknown or has no sub directives
     */
    private function directives($parent = null)
    {
        // Sub directives shared by Allow and Disallow
        $pathSubDirectives = [
            self::DIRECTIVE_CLEAN_PARAM,
            self::DIRECTIVE_HOST,
        ];
        $tree = [
            self::DIRECTIVE_ALLOW => $pathSubDirectives,
            self::DIRECTIVE_CACHE_DELAY => [],
            self::DIRECTIVE_CLEAN_PARAM => [],
            self::DIRECTIVE_CRAWL_DELAY => [],
            self::DIRECTIVE_DISALLOW => $pathSubDirectives,
            self::DIRECTIVE_HOST => [],
            self::DIRECTIVE_SITEMAP => [],
            self::DIRECTIVE_USER_AGENT => [
                self::DIRECTIVE_ALLOW,
                self::DIRECTIVE_CACHE_DELAY,
                self::DIRECTIVE_CRAWL_DELAY,
                self::DIRECTIVE_DISALLOW,
            ],
        ];
        if ($parent === null) {
            return array_keys($tree);
        }
        if (isset($tree[$parent])) {
            return $tree[$parent];
        }
        return [];
    }
186
187
    /**
     * Parse line
     *
     * Parses the current line and, recursively, any directive that is written inline in its rule.
     *
     * @param string|null $parent - directive the current line is nested in, null for a top level line
     * @return array|false - parsed rule, false if the line is not a valid pair or is not allowed below the parent
     */
    private function parseLine($parent = null)
    {
        if ($this->generateRulePair() === false) {
            return false;
        }
        if (!in_array($this->directive, $this->directives($parent))) {
            return false;
        }
        // Keep the outer pair, the recursive call below overwrites directive and rule
        $outerDirective = $this->directive;
        $outerRule = $this->rule;
        // generateRulePair() has just stored a string rule, so the line remains a string
        // even though the rule property is documented as array|string
        $this->line = $outerRule;
        // Try to parse the rule itself as an inline directive
        $nested = $this->parseLine($outerDirective);
        $this->directive = $outerDirective;
        $this->rule = ($nested !== false) ? $nested : $outerRule;
        return $this->add();
    }
212
213
    /**
     * Add value to directive
     *
     * Hands the current rule over to the handler of the current directive.
     *
     * @return array|false - result of the handler, false if the directive has no handler
     */
    private function add()
    {
        $current = $this->directive;
        if ($current == self::DIRECTIVE_ALLOW || $current == self::DIRECTIVE_DISALLOW) {
            return $this->addDisAllow();
        }
        if ($current == self::DIRECTIVE_CACHE_DELAY || $current == self::DIRECTIVE_CRAWL_DELAY) {
            return $this->addFloat();
        }
        if ($current == self::DIRECTIVE_CLEAN_PARAM) {
            return $this->addCleanParam();
        }
        if ($current == self::DIRECTIVE_HOST) {
            return $this->addHost();
        }
        if ($current == self::DIRECTIVE_SITEMAP) {
            return $this->addSitemap();
        }
        if ($current == self::DIRECTIVE_USER_AGENT) {
            return $this->setUserAgent();
        }
        return false;
    }
238
239
    /**
     * Add an Allow or Disallow rule
     *
     * @return array - the current directive mapped to either the inline directive array or a list of paths
     */
    private function addDisAllow()
    {
        // An array rule is an already parsed inline directive and is passed on as is,
        // a plain string rule is a path and gets wrapped into a path list
        $value = is_array($this->rule)
            ? $this->rule
            : ['path' => [$this->rule]];
        return [$this->directive => $value];
    }
261
262
    /**
     * Add float value
     *
     * @return array|false - the current directive mapped to the float value, false if the value is zero
     */
    private function addFloat()
    {
        $number = floatval($this->rule);
        // Non numeric rules convert to zero and are rejected together with an explicit zero
        if (!$number) {
            return false;
        }
        return [$this->directive => $number];
    }
276
277
    /**
     * Add Clean-Param record
     *
     * @return array - the parameter list, stored below the current directive and the path prefix
     */
    private function addCleanParam()
    {
        // Clean-param has no sub directives, so the rule is the raw string of the line here
        // and never the array an inline directive would produce
        $exploded = $this->explodeCleanParamRule($this->rule);
        $record = [];
        foreach ($exploded['param'] as $name) {
            $record[$this->directive]['path'][$exploded['path']]['param'][] = $name;
        }
        return $record;
    }
291
292
    /**
     * Explode Clean-Param rule
     *
     * Splits the rule at the first run of whitespace into the parameter list and an optional path prefix.
     *
     * @param  string $rule
     * @return array - 'path' holds the sanitized path prefix ("/*" if none was given),
     *                 'param' holds the list of parameter names
     */
    private function explodeCleanParamRule($rule)
    {
        // split into parameter and path
        $array = array_map('trim', mb_split('\s+', $rule, 2));
        $cleanParam = [];
        // strip any invalid characters from path prefix; the hyphen is escaped so it is kept as a
        // literal, unescaped it formed the range from `.` to `/` and hyphens were removed from paths
        $cleanParam['path'] = isset($array[1])
            ? $this->urlEncode(mb_ereg_replace('[^A-Za-z0-9\.\-\/\*\_]', '', $array[1]))
            : "/*";
        // parameters are separated by ampersands
        $cleanParam['param'] = array_map('trim', mb_split('&', $array[0]));
        return $cleanParam;
    }
311
312
    /**
     * URL encoder according to RFC 3986
     * Percent-encodes the whole string and then restores the reserved characters,
     * so only characters that are not allowed in a URL stay encoded.
     * @link http://publicmind.in/blog/url-encoding/
     *
     * @param string $url
     * @return string
     */
    private function urlEncode($url)
    {
        // Pattern of each encoded reserved character => the literal character it is restored to.
        // The percent sign goes last, so an encoded percent sign is only restored once.
        $restore = [
            '!%3A!ui' => ":",
            '!%2F!ui' => "/",
            '!%3F!ui' => "?",
            '!%23!ui' => "#",
            '!%5B!ui' => "[",
            '!%5D!ui' => "]",
            '!%40!ui' => "@",
            '!%21!ui' => "!",
            '!%24!ui' => "$",
            '!%26!ui' => "&",
            '!%27!ui' => "'",
            '!%28!ui' => "(",
            '!%29!ui' => ")",
            '!%2A!ui' => "*",
            '!%2B!ui' => "+",
            '!%2C!ui' => ",",
            '!%3B!ui' => ";",
            '!%3D!ui' => "=",
            '!%25!ui' => "%",
        ];
        $encoded = rawurlencode($url);
        return preg_replace(array_keys($restore), array_values($restore), $encoded);
    }
345
346
    /**
     * Add Host
     *
     * Accepts a bare host name as well as a host with scheme and/or port.
     *
     * @return array|false - the host directive mapped to a one element list, false if the rule is not a valid host
     */
    private function addHost()
    {
        // Host has no sub directives, so the rule is expected to be the raw string of the line;
        // the rule property may hold an array in general, hence the explicit type check
        if (!is_string($this->rule)) {
            return false;
        }
        $parsed = parse_url($this->urlEncode($this->rule));
        if ($parsed === false) {
            return false;
        }
        // A bare host name without scheme is reported by parse_url() as path
        if (isset($parsed['host'])) {
            $host = $parsed['host'];
        } elseif (isset($parsed['path'])) {
            $host = $parsed['path'];
        } else {
            // Neither host nor path, e.g. a rule that only consists of a query string
            return false;
        }
        if (
            !$this->urlValidateHost($host)
            || (isset($parsed['scheme']) && !$this->urlValidateScheme($parsed['scheme']))
        ) {
            return false;
        }
        $scheme = isset($parsed['scheme']) ? $parsed['scheme'] . '://' : '';
        $port = isset($parsed['port']) ? ':' . $parsed['port'] : '';
        return [
            self::DIRECTIVE_HOST => [
                $scheme . $host . $port,
            ]
        ];
    }
371
372
    /**
     * Validate host name
     *
     * The patterns are written with delimiters and flags, which is the syntax of preg_match().
     * mb_ereg_match() takes patterns without delimiters, so it treated the slashes as literal
     * characters and never matched any host.
     *
     * @link http://stackoverflow.com/questions/1755144/how-to-validate-domain-name-in-php
     *
     * @param  string $host
     * @return bool
     */
    private static function urlValidateHost($host)
    {
        return (
            is_string($host)
            && preg_match('/^([a-z\d](-*[a-z\d])*)(\.([a-z\d](-*[a-z\d])*))*$/iD', $host) === 1 //valid chars check
            && preg_match('/^.{1,253}$/D', $host) === 1 //overall length check
            && preg_match('/^[^\.]{1,63}(\.[^\.]{1,63})*$/D', $host) === 1 //length of each label
            && !filter_var($host, FILTER_VALIDATE_IP) //is not an IP address
        );
    }
389
390
    /**
     * Validate URL scheme
     *
     * Schemes are compared case-insensitively, since URI schemes are case-insensitive (RFC 3986, section 3.1).
     *
     * @param  string $scheme
     * @return bool - true for http, https, ftp and sftp
     */
    private static function urlValidateScheme($scheme)
    {
        if (!is_string($scheme)) {
            return false;
        }
        // strict comparison, so that no type juggling can let an unexpected value pass
        return in_array(strtolower($scheme), [
                'http', 'https',
                'ftp', 'sftp'
            ],
            true
        );
    }
404
405
    /**
     * Add Sitemap
     *
     * @return array|false - the sitemap directive mapped to a one element list with the encoded URL,
     *                       false if the URL is not valid
     */
    private function addSitemap()
    {
        // Sitemap has no sub directives, so the rule is the raw string of the line here
        // and never the array an inline directive would produce
        $location = $this->urlEncode($this->rule);
        if ($this->urlValidate($location)) {
            return [
                self::DIRECTIVE_SITEMAP => [$location]
            ];
        }
        return false;
    }
421
422
    /**
     * Validate URL
     *
     * A URL is valid if it passes FILTER_VALIDATE_URL and has both a valid host and a supported scheme.
     *
     * @param string $url
     * @return bool
     */
    public function urlValidate($url)
    {
        if (filter_var($url, FILTER_VALIDATE_URL) === false) {
            return false;
        }
        $parsed = parse_url($url);
        // FILTER_VALIDATE_URL also accepts URLs without a host (e.g. mailto:),
        // so both parts must be checked for existence before they are read
        if ($parsed === false || !isset($parsed['host'], $parsed['scheme'])) {
            return false;
        }
        return self::urlValidateHost($parsed['host'])
            && self::urlValidateScheme($parsed['scheme']);
    }
437
438
    /**
     * Set User-Agent(s)
     *
     * Consecutive User-agent lines form one group; any other previous directive starts a new group.
     *
     * @return array - always empty, a User-agent line does not add a rule on its own
     */
    private function setUserAgent()
    {
        if ($this->previous === self::DIRECTIVE_USER_AGENT) {
            // Previous line was a User-agent line too, extend the current group
            $this->userAgents[] = $this->rule;
        } else {
            // Start a new group
            $this->userAgents = [$this->rule];
        }
        return [];
    }
456
457
    /**
     * Assign User-Agent dependent rules to the User-Agent arrays
     *
     * @return array - the current rule, nested below each current User-Agent if the directive
     *                 is a sub directive of User-agent, unchanged otherwise
     */
    private function assignUserAgent()
    {
        $perUserAgent = $this->directives(self::DIRECTIVE_USER_AGENT);
        // Rules that do not depend on the User-Agent are returned as they are
        if (!in_array($this->directive, $perUserAgent)) {
            return $this->rule;
        }
        // Copy the rule to every User-Agent of the current group
        $assigned = [];
        foreach ($this->userAgents as $agent) {
            $assigned[self::DIRECTIVE_USER_AGENT][$agent] = $this->rule;
        }
        return $assigned;
    }
473
474
    /**
     * Get rules
     *
     * @return array - all rules that were collected while parsing the content
     */
    public function getRules()
    {
        return $this->rules;
    }
481
}
482