1
|
|
|
<?php |
2
|
|
|
namespace vipnytt\RobotsTxtParser; |
3
|
|
|
|
4
|
|
|
use vipnytt\RobotsTxtParser\Exceptions\TxtParserException; |
5
|
|
|
|
6
|
|
|
/**
 * Class TxtParser
 *
 * Parses raw robots.txt content, line by line, into a nested rule array
 * which is available through getRules().
 *
 * @package vipnytt\RobotsTxtParser
 */
class TxtParser
{
    /**
     * Robots.txt max length in bytes
     */
    const DEFAULT_BYTE_LIMIT = 500000;

    /**
     * Max rule length
     */
    const RULE_MAX_LENGTH = 500;

    /**
     * Directives
     */
    const DIRECTIVE_ALLOW = 'allow';
    const DIRECTIVE_CACHE_DELAY = 'cache-delay'; // unofficial
    const DIRECTIVE_CLEAN_PARAM = 'clean-param'; // Yandex only
    const DIRECTIVE_CRAWL_DELAY = 'crawl-delay';
    const DIRECTIVE_DISALLOW = 'disallow';
    const DIRECTIVE_HOST = 'host'; // Yandex only
    const DIRECTIVE_SITEMAP = 'sitemap';
    const DIRECTIVE_USER_AGENT = 'user-agent';

    /**
     * Default User-Agent
     */
    const FALLBACK_USER_AGENT = '*';

    /**
     * RAW robots.txt content
     * @var string
     */
    private $raw = '';

    /**
     * Rule array
     * @var array
     */
    private $rules = [];

    /**
     * User-Agents the following user-agent dependent rules are assigned to
     * @var array
     */
    private $userAgents = [self::FALLBACK_USER_AGENT];

    /**
     * Current line (or the remaining part of it, while parsing inline directives)
     * @var string
     */
    private $line = '';

    /**
     * Directive of the previous successfully parsed line
     * @var string|null
     */
    private $previous;

    /**
     * Current Directive
     * @var string
     */
    private $directive;

    /**
     * Current Rule; a string while parsing, an array once the rule has been built
     * @var array|string
     */
    private $rule;

    /**
     * Constructor
     *
     * Note: sets the global mbstring internal encoding as a side effect.
     *
     * @param string $content - file content
     * @param string|null $encoding - character encoding, detected from the content if null
     * @param int|null $byteLimit - maximum of bytes to parse, no limit if not an integer
     * @throws TxtParserException if the encoding cannot be detected or set
     */
    public function __construct($content, $encoding = null, $byteLimit = self::DEFAULT_BYTE_LIMIT)
    {
        if ($encoding === null) {
            $encoding = mb_detect_encoding($content);
        }
        if ($encoding === false) {
            // Detection failed; there is no usable encoding name to continue with
            throw new TxtParserException('Unable to detect character encoding');
        }
        try {
            $isSet = mb_internal_encoding($encoding);
        } catch (\ValueError $error) {
            // PHP 8 throws instead of returning false for unknown encodings
            $isSet = false;
        }
        if (!$isSet) {
            throw new TxtParserException('Unable to set internal character encoding to `' . $encoding . '`');
        }

        $this->raw = is_int($byteLimit) ? mb_strcut($content, 0, $byteLimit, $encoding) : $content;
        $this->parseTxt();
    }

    /**
     * Parse robots.txt
     *
     * Splits the raw content into lines, strips comments and merges every
     * successfully parsed line into the rule array.
     *
     * @return void
     */
    private function parseTxt()
    {
        // Accept CRLF, LF and lone CR as line separators
        $split = mb_split('\r\n|\n|\r', $this->raw);
        if ($split === false) {
            return;
        }
        $lines = array_filter(array_map('trim', $split));
        // Parse each line individually
        foreach ($lines as $line) {
            // Limit rule length and remove comments
            $this->line = mb_split('#', mb_substr($line, 0, self::RULE_MAX_LENGTH), 2)[0];
            // Parse line; parseLine() itself rejects lines without a supported directive
            if (($result = $this->parseLine()) === false) {
                continue;
            }
            // Add rule
            $this->previous = $this->directive;
            $this->rule = $result;
            // New rules are merged in front of the existing ones.
            // NOTE(review): repeated scalar rules (e.g. two crawl-delay lines for the same
            // User-Agent) are combined into an array by array_merge_recursive - confirm
            // that consumers of getRules() expect this.
            $this->rules = array_merge_recursive($this->assignUserAgent(), $this->rules);
        }
    }

    /**
     * Generate Directive:Rule pair
     *
     * Splits the current line at the first colon and, if the left hand side is
     * a supported directive, stores the pair in $this->directive / $this->rule.
     *
     * @return bool - false if the line does not contain a supported directive with a value
     */
    private function generateRulePair()
    {
        // Split by directive and rule
        $pair = array_map('trim', mb_split(':', $this->line, 2));
        // Check if the line contains a rule.
        // empty() is used deliberately: a missing value, '' and '0' are all treated as "no value".
        if (empty($pair[1]) || empty($pair[0])) {
            return false;
        }
        $directive = mb_strtolower($pair[0]);
        if (!in_array($directive, $this->directives(), true)) {
            // Line does not contain any supported directive
            return false;
        }
        $this->directive = $directive;
        $this->rule = $pair[1];
        return true;
    }

    /**
     * Directives and sub directives
     *
     * The sub directive lists serve two purposes: they define which directives
     * may be written inline after a parent directive, and the list of the
     * User-Agent directive defines which directives are User-Agent dependent.
     *
     * @param string|null $parent
     * @return array - all supported directives if no parent is given, otherwise the sub directives of the parent
     */
    private function directives($parent = null)
    {
        $array = [
            self::DIRECTIVE_ALLOW => [
                self::DIRECTIVE_CLEAN_PARAM,
                self::DIRECTIVE_HOST,
            ],
            self::DIRECTIVE_CACHE_DELAY => [],
            self::DIRECTIVE_CLEAN_PARAM => [],
            self::DIRECTIVE_CRAWL_DELAY => [],
            self::DIRECTIVE_DISALLOW => [
                self::DIRECTIVE_CLEAN_PARAM,
                self::DIRECTIVE_HOST,
            ],
            self::DIRECTIVE_HOST => [],
            self::DIRECTIVE_SITEMAP => [],
            self::DIRECTIVE_USER_AGENT => [
                self::DIRECTIVE_ALLOW,
                self::DIRECTIVE_CACHE_DELAY,
                self::DIRECTIVE_CRAWL_DELAY,
                self::DIRECTIVE_DISALLOW,
            ],
        ];
        if ($parent !== null) {
            return isset($array[$parent]) ? $array[$parent] : [];
        }
        return array_keys($array);
    }

    /**
     * Parse line
     *
     * Parses the current line and, recursively, any inline directive found in
     * its value (e.g. `Disallow: Clean-param: ...`).
     *
     * @param string|null $parent - directive the current line is written inline of
     * @return array|false - the rule array, or false if the line is not a valid rule
     */
    private function parseLine($parent = null)
    {
        if (
            $this->generateRulePair() === false
            || !in_array($this->directive, $this->directives($parent), true)
        ) {
            return false;
        }
        // Cache directive/rule variables until after the inline directives have been parsed
        $directive = $this->directive;
        $rule = $this->rule;
        // The value of a User-Agent line is always a plain name. Parsing it for inline
        // directives would turn it into an array, which cannot be used as a User-Agent key.
        if ($directive !== self::DIRECTIVE_USER_AGENT) {
            $this->line = $rule;
            $inline = $this->parseLine($directive);
            if ($inline !== false) {
                $rule = $inline;
            }
        }
        // The recursion may have overwritten both properties; restore them
        $this->directive = $directive;
        $this->rule = $rule;
        return $this->add();
    }

    /**
     * Add value to directive
     *
     * Dispatches the current rule to the handler of the current directive.
     *
     * @return array|false
     */
    private function add()
    {
        switch ($this->directive) {
            case self::DIRECTIVE_ALLOW:
            case self::DIRECTIVE_DISALLOW:
                return $this->addDisAllow();
            case self::DIRECTIVE_CACHE_DELAY:
            case self::DIRECTIVE_CRAWL_DELAY:
                return $this->addFloat();
            case self::DIRECTIVE_CLEAN_PARAM:
                return $this->addCleanParam();
            case self::DIRECTIVE_HOST:
                return $this->addHost();
            case self::DIRECTIVE_SITEMAP:
                return $this->addSitemap();
            case self::DIRECTIVE_USER_AGENT:
                return $this->setUserAgent();
        }
        return false;
    }

    /**
     * Add an Allow or Disallow rule
     *
     * @return array
     */
    private function addDisAllow()
    {
        // If inline directive, pass the array
        if (is_array($this->rule)) {
            return [
                $this->directive => $this->rule,
            ];
        }
        // Return an array of paths
        return [
            $this->directive => [
                'path' => [
                    $this->rule,
                ],
            ],
        ];
    }

    /**
     * Add float value
     *
     * @return array|false - false if the value is not a number greater than zero
     */
    private function addFloat()
    {
        $float = floatval($this->rule);
        // Non-numeric values evaluate to 0; zero and negative delays are meaningless
        if ($float <= 0) {
            return false;
        }
        return [
            $this->directive => $float,
        ];
    }

    /**
     * Add Clean-Param record
     *
     * @return array
     */
    private function addCleanParam()
    {
        $result = [];
        $cleanParam = $this->explodeCleanParamRule($this->rule);
        foreach ($cleanParam['param'] as $param) {
            $result[$this->directive]['path'][$cleanParam['path']]['param'][] = $param;
        }
        return $result;
    }

    /**
     * Explode Clean-Param rule
     *
     * @param string $rule - `param1&param2 [path]`
     * @return array - ['path' => string, 'param' => string[]]
     */
    private function explodeCleanParamRule($rule)
    {
        // split into parameter and path
        $array = array_map('trim', mb_split('\s+', $rule, 2));
        $cleanParam = [
            'path' => '/*',
            'param' => [],
        ];
        if (isset($array[1])) {
            // strip any invalid characters from path prefix
            // (the hyphen is escaped, otherwise `.-/` is read as a character range)
            $path = (string)mb_ereg_replace('[^A-Za-z0-9\.\-\/\*\_]', '', $array[1]);
            if ($path !== '') {
                $cleanParam['path'] = $this->urlEncode($path);
            }
        }
        foreach (array_map('trim', mb_split('&', $array[0])) as $key) {
            // Skip empty names caused by leading, trailing or doubled ampersands
            if ($key !== '') {
                $cleanParam['param'][] = $key;
            }
        }
        return $cleanParam;
    }

    /**
     * URL encoder according to RFC 3986
     * Returns a string containing the encoded URL with disallowed characters converted to their percentage encodings.
     * @link http://publicmind.in/blog/url-encoding/
     *
     * @param string $url
     * @return string
     */
    private function urlEncode($url)
    {
        // Reserved characters are decoded again after rawurlencode().
        // The percent sign must stay last, so already encoded sequences are kept intact.
        $reserved = [
            ":" => '!%3A!ui',
            "/" => '!%2F!ui',
            "?" => '!%3F!ui',
            "#" => '!%23!ui',
            "[" => '!%5B!ui',
            "]" => '!%5D!ui',
            "@" => '!%40!ui',
            "!" => '!%21!ui',
            "$" => '!%24!ui',
            "&" => '!%26!ui',
            "'" => '!%27!ui',
            "(" => '!%28!ui',
            ")" => '!%29!ui',
            "*" => '!%2A!ui',
            "+" => '!%2B!ui',
            "," => '!%2C!ui',
            ";" => '!%3B!ui',
            "=" => '!%3D!ui',
            "%" => '!%25!ui'
        ];
        return preg_replace(array_values($reserved), array_keys($reserved), rawurlencode($url));
    }

    /**
     * Add Host
     *
     * @return array|false - false if the value does not contain a valid host name
     */
    private function addHost()
    {
        $parsed = parse_url($this->urlEncode($this->rule));
        if ($parsed === false) {
            return false;
        }
        // A bare host name without scheme is reported as `path` by parse_url()
        if (isset($parsed['host'])) {
            $host = $parsed['host'];
        } elseif (isset($parsed['path'])) {
            $host = $parsed['path'];
        } else {
            return false;
        }
        if (
            !self::urlValidateHost($host)
            || (isset($parsed['scheme']) && !self::urlValidateScheme($parsed['scheme']))
        ) {
            return false;
        }
        $scheme = isset($parsed['scheme']) ? $parsed['scheme'] . '://' : '';
        $port = isset($parsed['port']) ? ':' . $parsed['port'] : '';
        return [
            self::DIRECTIVE_HOST => [
                $scheme . $host . $port,
            ],
        ];
    }

    /**
     * Validate host name
     *
     * @link http://stackoverflow.com/questions/1755144/how-to-validate-domain-name-in-php
     *
     * @param string $host
     * @return bool
     */
    private static function urlValidateHost($host)
    {
        // The patterns are PCRE patterns (delimiters and modifiers), hence preg_match()
        return (
            is_string($host)
            && preg_match('/^([a-z\d](-*[a-z\d])*)(\.([a-z\d](-*[a-z\d])*))*$/iD', $host) === 1 //valid chars check
            && preg_match('/^.{1,253}$/D', $host) === 1 //overall length check
            && preg_match('/^[^\.]{1,63}(\.[^\.]{1,63})*$/D', $host) === 1 //length of each label
            && !filter_var($host, FILTER_VALIDATE_IP) //is not an IP address
        );
    }

    /**
     * Validate URL scheme
     *
     * @param string $scheme
     * @return bool
     */
    private static function urlValidateScheme($scheme)
    {
        // Schemes are case-insensitive
        return in_array(strtolower($scheme), [
            'http', 'https',
            'ftp', 'sftp',
        ], true);
    }

    /**
     * Add Sitemap
     *
     * @return array|false - false if the value is not a valid URL
     */
    private function addSitemap()
    {
        $url = $this->urlEncode($this->rule);
        if (!$this->urlValidate($url)) {
            return false;
        }
        return [
            self::DIRECTIVE_SITEMAP => [
                $url,
            ],
        ];
    }

    /**
     * Validate URL
     *
     * @param string $url
     * @return bool
     */
    public function urlValidate($url)
    {
        if (!filter_var($url, FILTER_VALIDATE_URL)) {
            return false;
        }
        $parsed = parse_url($url);
        // FILTER_VALIDATE_URL accepts URLs without a host (e.g. `mailto:`), so check before reading
        return (
            $parsed !== false
            && isset($parsed['host'], $parsed['scheme'])
            && self::urlValidateHost($parsed['host'])
            && self::urlValidateScheme($parsed['scheme'])
        );
    }

    /**
     * Set User-Agent(s)
     *
     * Consecutive User-Agent lines form one group; any other directive in
     * between starts a new group.
     *
     * @return array - always empty, the User-Agent itself is not a rule
     */
    private function setUserAgent()
    {
        if ($this->previous === self::DIRECTIVE_USER_AGENT) {
            $this->userAgents[] = $this->rule;
        } else {
            $this->userAgents = [
                $this->rule,
            ];
        }
        return [];
    }

    /**
     * Assign User-Agent dependent rules to the User-Agent arrays
     *
     * @return array - the current rule, nested below every current User-Agent if the directive is User-Agent dependent
     */
    private function assignUserAgent()
    {
        if (in_array($this->directive, $this->directives(self::DIRECTIVE_USER_AGENT), true)) {
            $rule = [];
            foreach ($this->userAgents as $userAgent) {
                $rule[self::DIRECTIVE_USER_AGENT][$userAgent] = $this->rule;
            }
            return $rule;
        }
        return $this->rule;
    }

    /**
     * Get rules
     *
     * @return array - all rules parsed from the robots.txt content
     */
    public function getRules()
    {
        return $this->rules;
    }
}
482
|
|
|
|
Our type inference engine has found a suspicious assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.
For example, imagine you have a variable `$accountId` that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the `id` property of an instance of the `Account` class. This class holds a proper account, so the id value must no longer be false.
Either this assignment is in error or a type check should be added for that assignment.