GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Completed
Pull Request — master (#13)
by Dev
01:24 queued 10s
created

RobotsTxt::parse()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 14
rs 9.7998
c 0
b 0
f 0
cc 3
nc 3
nop 1
1
<?php
2
3
namespace Spatie\Robots;
4
5
/**
 * Parses the allow/disallow rules of a robots.txt document and answers whether
 * a given URL may be crawled by a given user agent.
 *
 * Rules are stored per (lower-cased) user agent as a map of
 * path pattern => bool, where true means "disallowed" and false means "allowed".
 */
class RobotsTxt
{
    protected static $robotsCache = [];

    /** @var array<string, array<string, bool>> rules per user agent, most specific pattern first */
    protected $disallowsPerUserAgent = [];

    /**
     * Builds an instance from the contents found at $source (a path or URL).
     *
     * Read failures are deliberately silenced: an unreadable source is treated
     * as an empty robots.txt, i.e. one without any rules.
     */
    public static function readFrom(string $source): self
    {
        $content = @file_get_contents($source);

        // file_get_contents() signals failure with false, not null.
        return new self($content === false ? '' : $content);
    }

    /**
     * @param string $content raw robots.txt text
     */
    public function __construct(string $content)
    {
        $this->disallowsPerUserAgent = $this->getDisallowsPerUserAgent($content);
    }

    /**
     * Builds an instance either from a robots.txt URL or from raw robots.txt text.
     *
     * $source is only fetched when it looks like a single URL: one token without
     * whitespace that starts with "http" and mentions "robots.txt". Anything else
     * (including robots.txt text that merely contains URLs) is parsed as content.
     */
    public static function create(string $source): self
    {
        $candidate = trim($source);

        $looksLikeUrl = strpos($candidate, 'http') === 0
            && strpos($candidate, 'robots.txt') !== false
            && preg_match('/\s/', $candidate) !== 1;

        if ($looksLikeUrl) {
            return self::readFrom($candidate);
        }

        return new self($source);
    }

    // Google says (https://developers.google.com/search/reference/robots_txt) :
    // Only one group of group-member records is valid for a particular crawler.
    // The crawler must determine the correct group of records by finding the group
    // with the most specific user-agent that still matches.
    /**
     * Tells whether $url may be crawled by $userAgent.
     *
     * Group lookup order: exact user agent, then the longest matching
     * "prefix*" group, then the "*" group. Without any applicable group the
     * URL is allowed.
     *
     * @param string      $url       URL or path to check; only its path component is used
     * @param string|null $userAgent user agent name; null is treated like '*'
     */
    public function allows(string $url, ?string $userAgent = '*'): bool
    {
        $userAgent = $userAgent === null ? '*' : strtolower($userAgent);

        // parse_url() yields null when there is no path and false for malformed URLs.
        $path = parse_url($url, PHP_URL_PATH);
        if (! is_string($path)) {
            $path = '';
        }

        $group = null;

        if ($userAgent !== '*') {
            $group = isset($this->disallowsPerUserAgent[$userAgent])
                ? $userAgent
                : $this->getWildCardUserAgent($userAgent);
        }

        if ($group === null) {
            $group = '*';
        }

        if (! isset($this->disallowsPerUserAgent[$group])) {
            return true;
        }

        return ! $this->pathIsDenied($path, $this->disallowsPerUserAgent[$group]);
    }

    /**
     * Finds the most specific "prefix*" group that matches $userAgent.
     *
     * Prefixes are tried from longest to shortest so that, for example,
     * "googlebot*" wins over "google*".
     *
     * @return string|null the matching group key, or null when there is none
     */
    protected function getWildCardUserAgent(string $userAgent): ?string
    {
        if ($userAgent === '*') {
            return null;
        }

        for ($length = strlen($userAgent); $length >= 1; $length--) {
            $candidate = substr($userAgent, 0, $length).'*';

            if (isset($this->disallowsPerUserAgent[$candidate])) {
                return $candidate;
            }
        }

        return null;
    }

    /**
     * Returns the verdict of the first rule whose pattern matches $path.
     *
     * @param string $path  URL path to test
     * @param array  $rules map of pattern => bool (true = disallowed), in priority order
     *
     * @return bool true when the first matching rule disallows the path; false
     *              when it allows the path or when no rule matches
     */
    protected function pathIsDenied(string $path, array $rules)
    {
        foreach ($rules as $pattern => $rule) {
            // Numeric-looking patterns come back from the array as int keys.
            if ($this->match((string) $pattern, $path)) {
                return (bool) $rule;
            }
        }

        return false;
    }

    /**
     * Tells whether $path contains one of the robots.txt special characters
     * '$' or '*'.
     */
    protected function complexRule($path): bool
    {
        return strpbrk((string) $path, '$*') !== false;
    }

    /**
     * Tests $string against a robots.txt path $pattern.
     *
     * The pattern is anchored at the start of $string, '*' matches any run of
     * characters, a trailing '$' anchors the end, and a trailing '/' is optional.
     *
     * @return bool
     */
    protected function match($pattern, $string)
    {
        $regex = preg_quote($pattern, '/');
        $regex = str_replace('\*', '.*', $regex);

        // A trailing "$" was escaped by preg_quote(); turn it back into an end anchor.
        if (substr($regex, -2) === '\$') {
            $regex = substr($regex, 0, -2).'$';
        }

        $regex = preg_replace('/\/$/', '/?', $regex);

        return (bool) preg_match('/^'.$regex.'/', $string);
    }

    /**
     * Parses robots.txt text into rules grouped by lower-cased user agent.
     *
     * Lines before the first User-agent line, lines that are neither allow nor
     * disallow directives, and directives with an empty path are ignored. A
     * User-agent line starts a fresh (empty) group for that agent.
     *
     * @return array<string, array<string, bool>>
     */
    protected function getDisallowsPerUserAgent(string $content): array
    {
        // Accept \n, \r\n and \r regardless of the platform this runs on.
        $lines = preg_split('/\r\n|\r|\n/', $content);

        $disallowsPerUserAgent = [];
        $currentUserAgent = null;

        foreach ($lines as $line) {
            if (trim($line) === '') {
                continue;
            }

            if ($this->isUserAgentLine($line)) {
                $currentUserAgent = $this->parseUserAgent($line);
                $disallowsPerUserAgent[$currentUserAgent] = [];

                continue;
            }

            if ($currentUserAgent === null) {
                continue;
            }

            $parsed = $this->parse($line);

            if ($parsed === null) { // other than allow/disallow
                continue;
            }

            [$pattern, $rule] = $parsed;

            // An empty path would compile to a pattern that matches every URL,
            // so "Disallow:" with no value must not become a rule.
            if ($pattern === null || $pattern === '') {
                continue;
            }

            $disallowsPerUserAgent[$currentUserAgent][$pattern] = $rule;
        }

        return $this->orderRules($disallowsPerUserAgent);
    }

    // Google says (https://developers.google.com/search/reference/robots_txt) :
    // At a group-member level, in particular for allow and disallow directives, the most specific rule
    // based on the length of the [path] entry will trump the less specific (shorter) rule. The order of
    // precedence for rules with wildcards is undefined.
    /**
     * Sorts every group's rules by pattern length, longest first.
     *
     * Patterns of equal length are ordered by their rule value, so an allow
     * (false) comes before a disallow (true).
     */
    protected function orderRules(array $disallowsPerUserAgent): array
    {
        foreach ($disallowsPerUserAgent as $userAgent => $rules) {
            if (count($rules) < 2) {
                continue;
            }

            // array_multisort() takes its arrays by reference, so the lengths
            // need to live in a variable.
            $lengths = array_map('strlen', array_map('strval', array_keys($rules)));
            array_multisort($lengths, SORT_DESC, $rules);

            $disallowsPerUserAgent[$userAgent] = $rules;
        }

        return $disallowsPerUserAgent;
    }

    /**
     * Tells whether $line is a "User-agent:" directive (case-insensitive,
     * whitespace allowed around the directive name).
     */
    protected function isUserAgentLine(string $line): bool
    {
        return preg_match('/^\s*user-agent\s*:/i', $line) === 1;
    }

    /**
     * Extracts the lower-cased user agent name from a "User-agent:" line.
     */
    protected function parseUserAgent(string $line): string
    {
        return strtolower(trim(preg_replace('/^\s*user-agent\s*:/i', '', $line)));
    }

    /**
     * Parses an allow/disallow directive.
     *
     * @return array|null [pattern, isDisallow] for "Allow:" / "Disallow:" lines,
     *                    null for anything else (crawl-delay, sitemap, comments...)
     */
    protected function parse(string $line): ?array
    {
        if (preg_match('/^\s*(dis)?allow\s*:(.*)$/is', $line, $matches) !== 1) {
            return null;
        }

        // Group 1 is empty for "allow" and "dis" for "disallow".
        return [trim($matches[2]), $matches[1] !== ''];
    }

    /**
     * Tells whether $url starts with $path.
     */
    protected function isUrlInDirectory(string $url, string $path): bool
    {
        return strpos($url, $path) === 0;
    }
}
175