1 | <?php |
||
5 | class RobotsTxt |
||
6 | { |
||
7 | protected static $robotsCache = []; |
||
8 | |||
9 | protected $disallowsPerUserAgent = []; |
||
10 | |||
11 | public static function readFrom(string $source): self |
||
17 | |||
18 | public function __construct(string $content) |
||
22 | |||
23 | public static function create(string $source): self |
||
34 | |||
35 | // Google says (https://developers.google.com/search/reference/robots_txt) : |
||
36 | // Only one group of group-member records is valid for a particular crawler. |
||
37 | // The crawler must determine the correct group of records by finding the group |
||
38 | // with the most specific user-agent that still matches. |
||
39 | public function allows(string $url, ?string $userAgent = '*'): bool |
||
59 | |||
/**
 * Find the wildcard user-agent group that most specifically matches a user agent.
 *
 * A wildcard group is any key of $this->disallowsPerUserAgent that contains "*".
 * Each such key is compared by its prefix, i.e. the key with its last character
 * removed ("google*" => "google"). When several wildcard keys match, the one with
 * the longest prefix is returned, so that the most specific matching group wins
 * instead of whichever happens to be declared first.
 *
 * @param string $userAgent User agent to look up; compared as-is (no case folding here).
 *
 * @return string|null The best matching wildcard key, or null when no wildcard key matches.
 */
protected function getWildCardUserAgent(string $userAgent): ?string
{
    $bestMatch = null;
    $bestPrefixLength = -1;

    foreach (array_keys($this->disallowsPerUserAgent) as $candidate) {
        // array_keys() yields ints for numeric-looking keys; cast before using string functions.
        $candidate = (string) $candidate;

        if (strpos($candidate, '*') === false) {
            continue;
        }

        // Drop the last character (the trailing "*"); the cast guards against a false return.
        $prefix = (string) substr($candidate, 0, -1);
        $prefixLength = strlen($prefix);

        // strncmp() with length 0 returns 0, so a bare "*" (empty prefix) matches any user
        // agent, but it is only kept when no longer prefix matches. The strict ">" keeps the
        // first declared key when two matching prefixes have the same length.
        if ($prefixLength > $bestPrefixLength && strncmp($userAgent, $prefix, $prefixLength) === 0) {
            $bestMatch = $candidate;
            $bestPrefixLength = $prefixLength;
        }
    }

    return $bestMatch;
}
||
73 | |||
74 | protected function pathIsDenied(string $path, array $rules) |
||
82 | |||
83 | protected function match($pattern, $string) |
||
92 | |||
93 | protected function getDisallowsPerUserAgent(string $content): array |
||
124 | |||
125 | // Google says (https://developers.google.com/search/reference/robots_txt) : |
||
126 | // At a group-member level, in particular for allow and disallow directives, the most specific rule |
||
127 | // based on the length of the [path] entry will trump the less specific (shorter) rule. The order of |
||
128 | // precedence for rules with wildcards is undefined. |
||
129 | protected function orderRules(array $disallowsPerUserAgent): array |
||
138 | |||
139 | protected function isUserAgentLine(string $line): bool |
||
143 | |||
144 | protected function parseUserAgent(string $line): string |
||
148 | |||
149 | protected function parse(string $line): ?array |
||
163 | |||
164 | protected function isUrlInDirectory(string $url, string $path): bool |
||
168 | } |
||
169 |