1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/*
 * This file is part of the bisarca/robots-txt package.
 *
 * (c) Emanuele Minotto <[email protected]>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */
11
|
|
|
|
12
|
|
|
namespace Bisarca\RobotsTxt; |
13
|
|
|
|
14
|
|
|
use Bisarca\RobotsTxt\Directive\DirectiveInterface; |
15
|
|
|
use Bisarca\RobotsTxt\Directive\NonGroupInterface; |
16
|
|
|
use Bisarca\RobotsTxt\Directive\StartOfGroupInterface; |
17
|
|
|
use Bisarca\RobotsTxt\Exception\ExceptionInterface; |
18
|
|
|
use Bisarca\RobotsTxt\Exception\MissingDirectiveException; |
19
|
|
|
|
20
|
|
|
/**
 * Robots.txt file parser.
 *
 * Splits raw robots.txt content into rows, turns every recognised row into a
 * directive object and collects the directives into rulesets.
 */
class Parser
{
    /**
     * Registered directives, keyed by lower-case field name. Each value is
     * the class instantiated for a row that starts with that field.
     *
     * The declaration order matters: the first matching field wins.
     *
     * @var string[]
     */
    const DIRECTIVES = [
        'allow' => Directive\Allow::class,
        'comment' => Directive\Comment::class,
        'disallow' => Directive\Disallow::class,
        'host' => Directive\Host::class,
        'sitemap' => Directive\Sitemap::class,
        'user-agent' => Directive\UserAgent::class,
    ];

    /**
     * Parse robots.txt content.
     *
     * Rows that cannot be turned into a directive (any ExceptionInterface
     * raised while creating it) are silently skipped. Directives found before
     * the first group is opened are discarded.
     *
     * @param string $content Raw robots.txt content
     *
     * @return Rulesets
     */
    public function parse(string $content): Rulesets
    {
        $groups = [];

        // Index of the group currently being filled. -1 is a throw-away
        // bucket for directives met before any group has been opened.
        $counter = -1;

        // Whether the previously accepted directive was a start-of-group one.
        $previous = false;

        foreach ($this->extractRows($content) as $row) {
            try {
                $directive = $this->createDirective($row);
            } catch (ExceptionInterface $exception) {
                // unknown or invalid rows are ignored
                continue;
            }

            $startsGroup = $directive instanceof StartOfGroupInterface;

            // A non-group directive always opens a group of its own.
            // A start-of-group directive opens a new group unless it directly
            // follows another start-of-group directive (consecutive
            // start-of-group records share the same group).
            if (
                $directive instanceof NonGroupInterface ||
                ($startsGroup && !$previous)
            ) {
                ++$counter;
            }

            $groups[$counter][] = $directive;
            $previous = $startsGroup;
        }

        // any group-member records without a preceding
        // start-of-group record are ignored
        unset($groups[-1]);

        $rulesets = [];

        foreach ($groups as $group) {
            $rulesets[] = new Ruleset(...$group);
        }

        return new Rulesets(...$rulesets);
    }

    /**
     * Extract single rows from main content.
     *
     * Comments (everything from the first "#" to the end of the line) and
     * surrounding whitespace are removed; rows that end up empty are dropped.
     * The keys of the returned array are the original zero-based row indexes,
     * so they are not necessarily sequential.
     *
     * @param string $content Raw robots.txt content
     *
     * @return array
     */
    private function extractRows(string $content): array
    {
        // Split on every common line ending (CRLF, CR, LF) rather than on
        // PHP_EOL: the file's line endings depend on whoever wrote it, not on
        // the platform this code runs on.
        $rows = preg_split('/\r\n|\r|\n/', $content);

        // remove comments and wrapper spaces
        $rows = array_map(static function ($row) {
            // a comment starts at the first "#", not at the last one
            return trim(explode('#', $row, 2)[0]);
        }, $rows);

        // empty lines aren't useful
        return array_filter($rows);
    }

    /**
     * Creates a directive from the raw line contained in the robots.txt file.
     *
     * The row is matched case-insensitively against every registered field;
     * the class registered for the first matching field is instantiated with
     * the raw row.
     *
     * @param string $row Raw line
     *
     * @throws MissingDirectiveException If no directive is available
     *
     * @return DirectiveInterface
     */
    private function createDirective(string $row): DirectiveInterface
    {
        foreach (self::DIRECTIVES as $field => $class) {
            // NOTE(review): the pattern requires whitespace and a value after
            // the colon, so rows such as "Disallow:/path" or "Disallow:" are
            // not recognised. Left as is because the directive classes that
            // receive the row are not visible from here; confirm before
            // relaxing it.
            $pattern = sprintf('/^%s:\s+.+/i', preg_quote($field, '/'));

            if (1 === preg_match($pattern, $row)) {
                return new $class($row);
            }
        }

        // no directives found for this row
        throw MissingDirectiveException::create($row);
    }
}
143
|
|
|
|