GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Completed
Pull Request — master (#13)
by Dev
01:11
created

RobotsTxt::getWildCardUserAgent()   A

Complexity

Conditions 4
Paths 3

Size

Total Lines 13

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 13
rs 9.8333
c 0
b 0
f 0
cc 4
nc 3
nop 1
1
<?php
2
3
namespace Spatie\Robots;
4
5
/**
 * Parses the contents of a robots.txt file and answers whether a URL may be
 * crawled by a given user agent.
 *
 * Rules are stored per lower-cased user agent as a map of path pattern to a
 * boolean: true means the pattern is disallowed, false means it is allowed.
 */
class RobotsTxt
{
    /** @var array Not read or written anywhere inside this class. */
    protected static $robotsCache = [];

    /**
     * Map of lower-cased user agent => (path pattern => true when disallowed, false when allowed).
     *
     * @var array
     */
    protected $disallowsPerUserAgent = [];

    /**
     * Builds an instance from the contents found at the given path or URL.
     *
     * An unreadable source is treated as an empty robots.txt (nothing is denied).
     *
     * @param string $source Anything file_get_contents() can read.
     */
    public static function readFrom(string $source): self
    {
        // file_get_contents() reports failure with false (never null), so "??"
        // cannot be used to supply the fallback. The "@" keeps the original
        // best-effort behaviour of not emitting a warning on failure.
        $content = @file_get_contents($source);

        return new self($content === false ? '' : $content);
    }

    /**
     * @param string $content Raw robots.txt contents.
     */
    public function __construct(string $content)
    {
        $this->disallowsPerUserAgent = $this->getDisallowsPerUserAgent($content);
    }

    /**
     * Builds an instance from either a robots.txt URL or raw robots.txt contents.
     *
     * The source is fetched when it mentions both "http" and "robots.txt";
     * otherwise it is parsed as contents.
     *
     * @param string $source URL of a robots.txt file, or the file contents.
     */
    public static function create(string $source): self
    {
        // A string spanning several lines can never be a URL, so multi-line
        // contents that merely mention "http" and "robots.txt" (for instance
        // in a Sitemap line) are parsed directly instead of being fetched.
        $looksLikeUrl = strpos($source, 'http') !== false
            && strpos($source, 'robots.txt') !== false
            && strpbrk($source, "\r\n") === false;

        return $looksLikeUrl ? self::readFrom($source) : new self($source);
    }

    // Google says (https://developers.google.com/search/reference/robots_txt) :
    // Only one group of group-member records is valid for a particular crawler.
    // The crawler must determine the correct group of records by finding the group
    // with the most specific user-agent that still matches.
    /**
     * Tells whether the given URL may be crawled by the given user agent.
     *
     * Group lookup order: exact user agent, then a wildcard user agent
     * (e.g. "google*"), then the catch-all "*" group. Only the path component
     * of the URL is matched against the rules. When no group applies the URL
     * is allowed.
     *
     * @param string      $url       Absolute URL or bare path.
     * @param string|null $userAgent Compared case-insensitively; null means "*".
     */
    public function allows(string $url, ?string $userAgent = '*'): bool
    {
        // Resolve null before lower-casing; strtolower(null) would turn it
        // into '' and make a later null check unreachable.
        $userAgent = strtolower($userAgent ?? '*');

        // parse_url() yields null when there is no path and false for a
        // seriously malformed URL; both are treated as an empty path.
        $path = parse_url($url, PHP_URL_PATH);
        if (! is_string($path)) {
            $path = '';
        }

        if ($userAgent !== '*') {
            if (isset($this->disallowsPerUserAgent[$userAgent])) {
                return ! $this->pathIsDenied($path, $this->disallowsPerUserAgent[$userAgent]);
            }

            $wildCardUserAgent = $this->getWildCardUserAgent($userAgent);

            if ($wildCardUserAgent !== null) {
                return ! $this->pathIsDenied($path, $this->disallowsPerUserAgent[$wildCardUserAgent]);
            }
        }

        if (isset($this->disallowsPerUserAgent['*'])) {
            return ! $this->pathIsDenied($path, $this->disallowsPerUserAgent['*']);
        }

        return true;
    }

    /**
     * Finds the wildcard group (a user agent ending in "*", other than the
     * bare "*") whose prefix the given user agent starts with.
     *
     * When several wildcard groups match, the one with the longest prefix is
     * returned, i.e. the most specific user agent that still matches.
     *
     * @param string $userAgent Lower-cased user agent to look up.
     *
     * @return string|null Key into $disallowsPerUserAgent, or null when nothing matches.
     */
    protected function getWildCardUserAgent(string $userAgent): ?string
    {
        $bestMatch = null;
        $bestLength = 0;

        foreach (array_keys($this->disallowsPerUserAgent) as $candidate) {
            // Array keys made only of digits come back as integers.
            $candidate = (string) $candidate;

            if ($candidate === '*' || substr($candidate, -1) !== '*') {
                continue;
            }

            $prefix = substr($candidate, 0, -1);

            if (strpos($userAgent, $prefix) === 0 && strlen($prefix) > $bestLength) {
                $bestMatch = $candidate;
                $bestLength = strlen($prefix);
            }
        }

        return $bestMatch;
    }

    /**
     * Returns the verdict of the first rule whose pattern matches the path.
     *
     * @param string $path  URL path to test.
     * @param array  $rules Map of pattern => true (disallow) / false (allow),
     *                      iterated in the order given.
     *
     * @return bool True when the first matching rule disallows the path;
     *              false when it allows it or when no rule matches.
     */
    protected function pathIsDenied(string $path, array $rules)
    {
        foreach ($rules as $pattern => $rule) {
            if ($this->match((string) $pattern, $path)) {
                return (bool) $rule;
            }
        }

        return false;
    }

    /**
     * Tests a path against a robots.txt path pattern.
     *
     * "*" matches any run of characters, a trailing "$" anchors the end of the
     * path, and a trailing "/" in the pattern is optional. Matching is always
     * anchored at the start of the path.
     *
     * @param string $pattern Pattern as written in robots.txt.
     * @param string $string  Path to test.
     *
     * @return bool
     */
    protected function match($pattern, $string)
    {
        $regex = preg_quote($pattern, '/');

        // Re-enable the two robots.txt metacharacters that preg_quote() escaped.
        $regex = str_replace('\*', '.*', $regex);

        if (substr($regex, -2) === '\$') {
            $regex = substr($regex, 0, -2).'$';
        }

        // Make a trailing slash optional.
        $regex = preg_replace('/\/$/', '/?', $regex);

        return (bool) preg_match('/^'.$regex.'/', $string);
    }

    /**
     * Parses robots.txt contents into rule groups keyed by lower-cased user agent.
     *
     * Lines before the first User-agent line and lines that are neither
     * allow nor disallow directives are ignored. Rules of groups that repeat a
     * user agent are merged. A directive with an empty path is skipped, because
     * an empty pattern would otherwise match every path.
     *
     * @param string $content Raw robots.txt contents.
     *
     * @return array Map of user agent => (pattern => bool), ordered by orderRules().
     */
    protected function getDisallowsPerUserAgent(string $content): array
    {
        // Split on any line-ending style rather than the platform's PHP_EOL:
        // the file comes from a remote host, not from the local platform.
        $lines = preg_split('/\r\n|\r|\n/', $content);
        $lines = array_filter($lines === false ? [] : $lines);

        $disallowsPerUserAgent = [];
        $currentUserAgent = null;

        foreach ($lines as $line) {
            if ($this->isUserAgentLine($line)) {
                $currentUserAgent = $this->parseUserAgent($line);

                if (! isset($disallowsPerUserAgent[$currentUserAgent])) {
                    $disallowsPerUserAgent[$currentUserAgent] = [];
                }

                continue;
            }

            if ($currentUserAgent === null) {
                continue;
            }

            $parsed = $this->parse($line);

            if ($parsed === null) { // other than allow/disallow
                continue;
            }

            [$pattern, $rule] = $parsed;

            if ($pattern === '') {
                continue;
            }

            $disallowsPerUserAgent[$currentUserAgent][$pattern] = $rule;
        }

        return $this->orderRules($disallowsPerUserAgent);
    }

    // Google says (https://developers.google.com/search/reference/robots_txt) :
    // At a group-member level, in particular for allow and disallow directives, the most specific rule
    // based on the length of the [path] entry will trump the less specific (shorter) rule. The order of
    // precedence for rules with wildcards is undefined.
    /**
     * Sorts the rules of every group so that the longest pattern comes first.
     *
     * @param array $disallowsPerUserAgent Map of user agent => (pattern => bool).
     *
     * @return array The same map with each group's rules reordered.
     */
    protected function orderRules(array $disallowsPerUserAgent): array
    {
        foreach ($disallowsPerUserAgent as $userAgent => $rules) {
            // array_multisort() takes its arrays by reference, so the lengths
            // need a real variable rather than a temporary expression.
            $lengths = array_map(static function ($pattern) {
                return strlen((string) $pattern);
            }, array_keys($rules));

            // Patterns of equal length are then ordered by the rule value
            // (ascending), which puts an allow (false) before a disallow (true).
            array_multisort($lengths, SORT_DESC, $rules);

            $disallowsPerUserAgent[$userAgent] = $rules;
        }

        return $disallowsPerUserAgent;
    }

    /**
     * Tells whether the line is a "User-agent:" directive (case-insensitive,
     * optional whitespace before the directive and before the colon).
     */
    protected function isUserAgentLine(string $line): bool
    {
        return preg_match('/^\s*user-agent\s*:/i', $line) === 1;
    }

    /**
     * Extracts the lower-cased, trimmed user agent name from a "User-agent:" line.
     */
    protected function parseUserAgent(string $line): string
    {
        return strtolower(trim(preg_replace('/^User-agent\s*:/i', '', trim($line))));
    }

    /**
     * Parses an allow/disallow directive.
     *
     * @param string $line A single robots.txt line.
     *
     * @return array|null [pattern, true] for disallow, [pattern, false] for
     *                    allow, or null for any other line (crawl-delay, sitemap...).
     */
    protected function parse(string $line): ?array
    {
        // Whitespace around the colon is tolerated, as it is for User-agent lines.
        if (preg_match('/^\s*(dis)?allow\s*:(.*)$/is', $line, $matches) !== 1) {
            return null; // else: could be crawl-delay, sitemap...
        }

        return [trim($matches[2]), $matches[1] !== ''];
    }

    /**
     * Tells whether the URL starts with the given path.
     */
    protected function isUrlInDirectory(string $url, string $path): bool
    {
        return strpos($url, $path) === 0;
    }
}
169