1 | <?php |
||
5 | class RobotsTxt |
||
6 | { |
||
7 | protected static $robotsCache = []; |
||
8 | |||
9 | protected $disallowsPerUserAgent = []; |
||
10 | |||
/**
 * Create an instance from the contents found at the given source.
 *
 * The source is read with file_get_contents(); warnings are suppressed on
 * purpose so that an unreadable source is treated as empty content rather
 * than raising an error.
 *
 * @param string $source Location handed to file_get_contents().
 *
 * @return self
 */
public static function readFrom(string $source): self
{
    $contents = @file_get_contents($source);

    if ($contents === false) {
        // Reading failed: fall back to an empty robots.txt.
        $contents = '';
    }

    return new self($contents);
}
||
17 | |||
18 | public function __construct(string $content) |
||
22 | |||
23 | public static function create(string $source): self |
||
34 | |||
/**
 * Determine whether the given URL may be visited by the given user agent.
 *
 * The URL is reduced to its path plus query string and checked against the
 * disallow rules registered for the user agent. When no rules exist for that
 * user agent, the rules of the "*" group are used; when that group does not
 * exist either, nothing is disallowed.
 *
 * @param string      $url       Absolute or relative URL to check.
 * @param string|null $userAgent User agent name; null is handled like an
 *                               empty name and therefore falls back to "*"
 *                               unless a group with an empty name exists.
 *
 * @return bool True when no disallow rule matches the URL.
 */
public function allows(string $url, ?string $userAgent = '*'): bool
{
    $requestUri = '';

    $parts = parse_url($url);

    if ($parts !== false) {
        $requestUri .= $parts['path'] ?? '';

        if (isset($parts['query'])) {
            $requestUri .= '?'.$parts['query'];
        } elseif ($this->hasEmptyQueryString($url)) {
            // parse_url() omits the 'query' key for an empty query string,
            // but the trailing "?" is still significant for rule matching.
            $requestUri .= '?';
        }
    }

    // The parameter is nullable, but trim() must not receive null
    // (deprecated since PHP 8.1, a TypeError under strict types).
    $normalizedUserAgent = strtolower(trim($userAgent ?? ''));

    $disallows = $this->disallowsPerUserAgent[$normalizedUserAgent]
        ?? $this->disallowsPerUserAgent['*']
        ?? [];

    return ! $this->pathIsDenied($requestUri, $disallows);
}
||
57 | |||
/**
 * Check whether a request URI matches at least one disallow pattern.
 *
 * Each pattern is matched from the start of the URI. A "*" inside a pattern
 * matches any run of characters, and a trailing "$" requires the URI to end
 * where the pattern ends. Empty patterns are ignored.
 *
 * @param string $requestUri Path (plus optional query string) to test.
 * @param array  $disallows  Disallow patterns as written in robots.txt.
 *
 * @return bool True as soon as one pattern matches, false otherwise.
 */
protected function pathIsDenied(string $requestUri, array $disallows): bool
{
    foreach ($disallows as $rule) {
        if ($rule === '') {
            continue;
        }

        // A trailing dollar sign anchors the pattern to the end of the URI.
        $mustEndHere = $rule[-1] === '$';

        $literal = $mustEndHere ? substr($rule, 0, -1) : $rule;

        // Escape the rule for use in a regexp, then turn the escaped stars
        // back into an eager "match anything".
        $escaped = str_replace('\\*', '.*', preg_quote($literal, '/'));

        // Always anchored at the start, optionally anchored at the end.
        $pattern = '/^'.$escaped.($mustEndHere ? '$' : '').'/';

        if (preg_match($pattern, $requestUri) === 1) {
            return true;
        }
    }

    return false;
}
||
97 | |||
/**
 * Tell whether the URL carries a query string that is present but empty.
 *
 * parse_url() does not set the 'query' key when the query string is empty,
 * so the "?" has to be detected on the raw URL instead.
 * See: https://bugs.php.net/bug.php?id=78385
 */
protected function hasEmptyQueryString(string $url): bool
{
    if ($url === '') {
        return false;
    }

    // "…?" — the URL ends right after the question mark.
    $endsWithQuestionMark = substr($url, -1) === '?';

    // "…?#fragment" — an empty query string directly followed by a fragment.
    $emptyQueryBeforeFragment = strpos($url, '?#') !== false;

    return $endsWithQuestionMark || $emptyQueryBeforeFragment;
}
||
120 | |||
/**
 * Parse robots.txt content into the disallow rules of each user agent group.
 *
 * Comment lines and anything that appears before the first "User-agent"
 * line are ignored. Only "Disallow" directives contribute rules; other
 * directives (Allow, Sitemap, Crawl-delay, ...) are skipped instead of
 * being misread as disallow patterns.
 *
 * @param string $content Raw robots.txt content.
 *
 * @return array Map of lower-cased user agent name to its disallow patterns
 *               (each pattern is stored as both key and value, so duplicates
 *               collapse).
 */
protected function getDisallowsPerUserAgent(string $content): array
{
    // Split on every newline convention. PHP_EOL describes the host OS, not
    // the file being parsed, so it must not decide how the content is split.
    $lines = preg_split('/\r\n|\r|\n/', $content);

    $lines = array_filter($lines === false ? [] : $lines);

    $disallowsPerUserAgent = [];

    $currentUserAgent = null;

    foreach ($lines as $line) {
        if ($this->isCommentLine($line)) {
            continue;
        }

        if ($this->isUserAgentLine($line)) {
            $userAgent = $this->parseUserAgent($line);

            $disallowsPerUserAgent[$userAgent] = [];

            // Following rules are written straight into this group's list.
            $currentUserAgent = &$disallowsPerUserAgent[$userAgent];

            continue;
        }

        if ($currentUserAgent === null) {
            // Rule without a preceding user agent group: nothing to attach it to.
            continue;
        }

        if (stripos(trim($line), 'disallow') !== 0) {
            // Not a Disallow directive; parseDisallow() would mangle it into a bogus rule.
            continue;
        }

        $disallowUrl = $this->parseDisallow($line);

        $currentUserAgent[$disallowUrl] = $disallowUrl;
    }

    // Break the reference so later writes cannot leak into the result.
    unset($currentUserAgent);

    return $disallowsPerUserAgent;
}
||
155 | |||
/**
 * Tell whether a line is a comment, i.e. its first non-whitespace
 * character is "#".
 */
protected function isCommentLine(string $line): bool
{
    $trimmed = trim($line);

    return $trimmed !== '' && $trimmed[0] === '#';
}
||
160 | |||
/**
 * Tell whether a line starts (ignoring surrounding whitespace and letter
 * case) with the "user-agent" directive name.
 */
protected function isUserAgentLine(string $line): bool
{
    $trimmed = trim($line);

    return stripos($trimmed, 'user-agent') === 0;
}
||
165 | |||
/**
 * Extract the lower-cased user agent name from a "User-agent" line.
 *
 * Only the leading directive name is removed. Replacing every occurrence
 * of "user-agent" would also cut that text out of the agent name itself
 * (e.g. "my-user-agent-bot").
 *
 * @param string $line A robots.txt line, expected to start with "User-agent".
 *
 * @return string The agent name without the directive, colon or padding.
 */
protected function parseUserAgent(string $line): string
{
    $directive = 'user-agent';

    $normalized = strtolower(trim($line));

    if (strpos($normalized, $directive) === 0) {
        $normalized = (string) substr($normalized, strlen($directive));
    }

    return trim($normalized, ': ');
}
||
170 | |||
/**
 * Extract the path pattern from a "Disallow" line.
 *
 * The first eight characters (the length of "disallow") are dropped, then
 * the separating colon and spaces are trimmed. The value keeps its original
 * letter case: paths are matched case-sensitively against the request URI,
 * so lower-casing the rule would stop "Disallow: /Admin" from ever
 * matching "/Admin".
 *
 * @param string $line A robots.txt line, expected to start with "Disallow".
 *
 * @return string The disallowed path pattern; empty when the line has no value.
 */
protected function parseDisallow(string $line): string
{
    // Cast guards against substr() returning false for short input on PHP 7.
    $value = (string) substr(trim($line), 8);

    return trim($value, ': ');
}
||
175 | |||
176 | /** |
||
177 | * @deprecated |
||
178 | */ |
||
179 | protected function concernsDirectory(string $path): bool |
||
183 | |||
/**
 * Tell whether the URL begins with the given path.
 *
 * @deprecated
 */
protected function isUrlInDirectory(string $url, string $path): bool
{
    $position = strpos($url, $path);

    // Only a match at offset 0 counts; a match further in (or none) does not.
    return $position === 0;
}
||
191 | } |
||
192 |