1 | <?php |
||
5 | class RobotsTxt |
||
6 | { |
||
7 | protected static $robotsCache = []; |
||
8 | |||
9 | protected $disallowsPerUserAgent = []; |
||
10 | |||
11 | public static function readFrom(string $source): self |
||
17 | |||
18 | public function __construct(string $content) |
||
22 | |||
23 | public static function create(string $source): self |
||
34 | |||
/**
 * Determine whether the given user agent may fetch the given URL.
 *
 * Only the path component of $url is compared against the rules. The group
 * named exactly after the lower-cased user agent is used; when no such group
 * exists the "*" group is used instead.
 *
 * @param string      $url       URL (or bare path) to check.
 * @param string|null $userAgent Crawler name; null is treated as "*".
 *
 * @return bool True when access is allowed, including when no rule applies.
 */
public function allows(string $url, ?string $userAgent = '*'): bool
{
    // Resolve null before lower-casing, otherwise the null case can never be detected.
    $userAgent = $userAgent === null ? '*' : strtolower($userAgent);

    // parse_url() yields null when there is no path and false for malformed URLs.
    $path = parse_url($url, PHP_URL_PATH);
    if (! is_string($path)) {
        $path = '';
    }

    $disallows = $this->disallowsPerUserAgent[$userAgent] ?? $this->disallowsPerUserAgent['*'] ?? [];

    // $denied is the matching rule's value (true = disallow, false = allow) or null.
    $type = null;
    $denied = $this->pathIsDenied($path, $disallows, $type);

    if ($denied !== null) {
        // A directory match (type 1) can be overruled by a wildcard user-agent
        // group such as "goo*"; an exact match (type 0) is final.
        if ($type === 1) {
            $wildcardAllows = $this->checkForWildcard($path, $userAgent);

            if ($wildcardAllows !== null) {
                return $wildcardAllows;
            }
        }

        return ! $denied;
    }

    // No rule in the selected group applies: let a wildcard user-agent group decide.
    // checkForWildcard() already answers "is it allowed?", so it is returned as is.
    $wildcardAllows = $this->checkForWildcard($path, $userAgent);

    if ($wildcardAllows !== null) {
        return $wildcardAllows;
    }

    return true;
}
||
78 | |||
/**
 * Evaluate $path against a wildcard user-agent group.
 *
 * Candidate group names are built from every prefix of $userAgent followed by
 * "*" (for "googlebot": "g*", "go*", ...), shortest first. The first group
 * that exists decides the outcome.
 *
 * @param string $path      Path handed to pathIsDenied().
 * @param string $userAgent Lower-cased user agent name.
 *
 * @return bool|null False when the group's matching rule is a disallow, true
 *                   when it is an allow or no rule of the group matches, null
 *                   when $userAgent is "*" or no wildcard group exists.
 */
protected function checkForWildcard(string $path, string $userAgent)
{
    if ($userAgent === '*') {
        return null;
    }

    $length = strlen($userAgent);

    for ($prefixLength = 1; $prefixLength <= $length; $prefixLength++) {
        $groupName = substr($userAgent, 0, $prefixLength).'*';

        if (! isset($this->disallowsPerUserAgent[$groupName])) {
            continue;
        }

        $rule = $this->pathIsDenied($path, $this->disallowsPerUserAgent[$groupName]);

        return ! $rule;
    }

    return null;
}
||
90 | |||
/**
 * Find the first rule that applies to $path and return its value.
 *
 * @param string   $path  URL path to test.
 * @param array    $rules Map of rule path => rule value, checked in order.
 * @param int|null $type  Receives 0 when the rule equals the path (with or
 *                        without the rule's trailing slashes), 1 when the path
 *                        lies inside the rule's directory, null when no rule
 *                        applies.
 *
 * @return mixed|null The value stored for the matching rule, or null when no
 *                    rule applies.
 */
protected function pathIsDenied(string $path, array $rules, &$type = null)
{
    // Never leave a value from an earlier call behind when nothing matches.
    $type = null;

    foreach ($rules as $uri => $rule) {
        // PHP converts numeric-looking array keys to integers; compare as strings.
        $uri = (string) $uri;

        if ($path === $uri || $path === rtrim($uri, '/')) {
            $type = 0;

            return $rule;
        }

        // A non-directory rule that is not an exact match says nothing about
        // this path, so keep looking instead of applying it.
        if (! $this->concernsDirectory($uri)) {
            continue;
        }

        if ($this->isUrlInDirectory($path, $uri)) {
            $type = 1;

            return $rule;
        }
    }

    return null;
}
||
115 | |||
/**
 * Parse robots.txt content into a map of rules per user agent.
 *
 * Lines are split on any line-ending style, "#" comments are removed, and
 * rule lines that appear before the first User-agent line are ignored. When
 * the same user agent opens several groups their rules are combined.
 *
 * @param string $content Raw robots.txt content.
 *
 * @return array Map of lower-cased user agent => [rule path => rule value],
 *               where the rule value is true for Disallow and false for Allow.
 */
protected function getDisallowsPerUserAgent(string $content): array
{
    // PHP_EOL only matches the host platform's line ending, not the file's.
    $lines = preg_split('/\r\n|\r|\n/', $content);

    if ($lines === false) {
        $lines = [$content];
    }

    $disallowsPerUserAgent = [];

    $currentUserAgent = null;

    foreach ($lines as $line) {
        // "#" starts a comment that runs to the end of the line.
        $commentStart = strpos($line, '#');

        if ($commentStart !== false) {
            $line = substr($line, 0, $commentStart);
        }

        if (trim($line) === '') {
            continue;
        }

        if ($this->isUserAgentLine($line)) {
            $currentUserAgent = $this->parseUserAgent($line);

            // Keep rules collected by an earlier group for the same agent.
            if (! isset($disallowsPerUserAgent[$currentUserAgent])) {
                $disallowsPerUserAgent[$currentUserAgent] = [];
            }

            continue;
        }

        if ($currentUserAgent === null) {
            continue;
        }

        $rule = null;
        $disallowUrl = $this->parse($line, $rule);

        // null means the line is something other than allow/disallow.
        if ($disallowUrl !== null) {
            $disallowsPerUserAgent[$currentUserAgent][$disallowUrl] = $rule;
        }
    }

    return $disallowsPerUserAgent;
}
||
149 | |||
/**
 * Tell whether a line opens a user-agent group, i.e. starts with
 * "User-agent" (any letter case) followed by optional whitespace and a colon.
 *
 * @param string $line Raw line; surrounding whitespace is ignored.
 */
protected function isUserAgentLine(string $line): bool
{
    // preg_match() returns an int (or false); compare so a real bool is returned.
    return preg_match('/^User-agent\s*:/i', trim($line)) === 1;
}
||
154 | |||
/**
 * Extract the user agent name from a User-agent line.
 *
 * The leading "User-agent:" label (any letter case, optional whitespace before
 * the colon) is removed and the remainder is trimmed and lower-cased.
 *
 * @param string $line Raw User-agent line.
 *
 * @return string Lower-cased user agent name.
 */
protected function parseUserAgent(string $line): string
{
    $trimmedLine = trim($line);
    $name = preg_replace('/^User-agent\s*:/i', '', $trimmedLine);

    return strtolower(trim($name));
}
||
159 | |||
/**
 * Parse an Allow/Disallow line.
 *
 * The directive is matched case-insensitively and, like the User-agent
 * matcher, tolerates whitespace before the colon.
 *
 * @param string    $line Raw line.
 * @param bool|null $type Set to true for a Disallow line and false for an
 *                        Allow line; left untouched for any other line.
 *
 * @return string|null The trimmed rule path, or null when the line is neither
 *                     an Allow nor a Disallow directive.
 */
protected function parse(string $line, &$type): ?string
{
    if (preg_match('/^(dis)?allow\s*:(.*)$/is', trim($line), $matches) !== 1) {
        // else: could be crawl-delay, sitemap...
        return null;
    }

    // The optional "dis" prefix is captured only for Disallow lines.
    $type = $matches[1] !== '';

    return trim($matches[2]);
}
||
178 | |||
179 | protected function concernsDirectory(string $path): bool |
||
183 | |||
184 | protected function isUrlInDirectory(string $url, string $path): bool |
||
188 | } |
||
189 |