GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Completed
Pull Request — master (#13)
by Dev
01:25
created

RobotsTxt::getWildCardUserAgent()   A

Complexity

Conditions 4
Paths 3

Size

Total Lines 13

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 13
rs 9.8333
c 0
b 0
f 0
cc 4
nc 3
nop 1
1
<?php
2
3
namespace Spatie\Robots;
4
5
class RobotsTxt
6
{
7
    // Static array, initially empty. No method shown in this class reads or
    // writes it. NOTE(review): confirm whether it is still needed (e.g. by a subclass).
    protected static $robotsCache = [];
8
9
    // Rules grouped by lower-cased user agent, filled in by the constructor:
    // [userAgent => [pathPattern => bool]], where true means "disallow" and
    // false means "allow".
    protected $disallowsPerUserAgent = [];
10
11
    /**
     * Build an instance from the contents found at a path or URL.
     *
     * A source that cannot be read is treated as an empty robots.txt.
     *
     * @param string $source Path or URL handed to file_get_contents().
     */
    public static function readFrom(string $source): self
    {
        // file_get_contents() signals failure with false, never null, so the
        // failure has to be detected explicitly; `?? ''` would never trigger.
        $content = @file_get_contents($source);

        return new self($content === false ? '' : $content);
    }
17
18
    /**
     * Parse raw robots.txt text and keep the resulting rule groups.
     *
     * @param string $content Raw robots.txt text.
     */
    public function __construct(string $content)
    {
        $parsedGroups = $this->getDisallowsPerUserAgent($content);

        $this->disallowsPerUserAgent = $parsedGroups;
    }
22
23
    /**
     * Create an instance from either a robots.txt URL or raw robots.txt text.
     *
     * The source is fetched only when the whole string is a single
     * whitespace-free http(s) URL that mentions "robots.txt". Anything else is
     * parsed as robots.txt content, so content that merely contains a link
     * (for example a comment pointing at the robots.txt standard) is no longer
     * mistaken for a URL.
     *
     * @param string $source URL of a robots.txt file, or robots.txt content.
     */
    public static function create(string $source): self
    {
        $looksLikeRobotsUrl = preg_match('/^https?:\/\/\S+\z/', $source) === 1
            && strpos($source, 'robots.txt') !== false;

        if ($looksLikeRobotsUrl) {
            return self::readFrom($source);
        }

        return new self($source);
    }
34
35
    // Google says (https://developers.google.com/search/reference/robots_txt) :
36
    // Only one group of group-member records is valid for a particular crawler.
37
    // The crawler must determine the correct group of records by finding the group
38
    // with the most specific user-agent that still matches.
39
    /**
     * Tell whether a URL may be crawled by the given user agent.
     *
     * Exactly one rule group is consulted, picked in this order: the group
     * named exactly like the user agent, then a wildcard group found by
     * getWildCardUserAgent(), then the '*' group. Without any applicable group
     * the URL is allowed.
     *
     * @param string      $url       URL (or bare path) to check.
     * @param string|null $userAgent Crawler name, compared case-insensitively;
     *                               null is treated as '*'.
     *
     * @return bool True unless the first matching rule of the group disallows the path.
     */
    public function allows(string $url, ?string $userAgent = '*'): bool
    {
        // Resolve null before lower-casing: strtolower(null) is deprecated and
        // would turn the later null check into dead code.
        $userAgent = $userAgent === null ? '*' : strtolower($userAgent);

        // parse_url() yields null when the URL has no path and false when the
        // URL is seriously malformed; both are treated as an empty path.
        $path = parse_url($url, PHP_URL_PATH);
        if (! is_string($path)) {
            $path = '';
        }

        $rules = null;

        if ($userAgent !== '*') {
            if (isset($this->disallowsPerUserAgent[$userAgent])) {
                $rules = $this->disallowsPerUserAgent[$userAgent];
            } else {
                $wildCardUserAgent = $this->getWildCardUserAgent($userAgent);

                if ($wildCardUserAgent !== null) {
                    $rules = $this->disallowsPerUserAgent[$wildCardUserAgent];
                }
            }
        }

        if ($rules === null && isset($this->disallowsPerUserAgent['*'])) {
            $rules = $this->disallowsPerUserAgent['*'];
        }

        if ($rules === null) {
            return true;
        }

        // pathIsDenied() gives back the matching rule (true = disallow) or
        // null when nothing matched; only an explicit disallow blocks the path.
        return ! $this->pathIsDenied($path, $rules);
    }
60
61
    /**
     * Find the wildcard rule group that applies to a user agent.
     *
     * A wildcard group is keyed by a prefix followed by '*' (e.g. 'google*').
     * Prefixes of the user agent are tried from longest to shortest, so the
     * most specific matching group wins when several wildcard groups match.
     *
     * @param string $userAgent Lower-cased user agent name.
     *
     * @return string|null Key of the matching group, or null when there is none.
     */
    protected function getWildCardUserAgent(string $userAgent): ?string
    {
        if ($userAgent === '*') {
            return null;
        }

        for ($length = strlen($userAgent); $length >= 1; $length--) {
            $wildCardUserAgent = substr($userAgent, 0, $length).'*';

            if (isset($this->disallowsPerUserAgent[$wildCardUserAgent])) {
                return $wildCardUserAgent;
            }
        }

        return null;
    }
74
75
    /**
     * Look up the first rule whose pattern matches the path.
     *
     * @param string $path  Path to test.
     * @param array  $rules Map of pattern => rule, checked in array order.
     *
     * @return mixed The rule stored for the first matching pattern, or null
     *               when no pattern matches.
     */
    protected function pathIsDenied(string $path, array $rules)
    {
        foreach (array_keys($rules) as $candidate) {
            if (! $this->match($candidate, $path)) {
                continue;
            }

            return $rules[$candidate];
        }

        return null;
    }
83
84
    /**
     * Tell whether a path pattern uses one of the special characters '$' or '*'.
     *
     * The previous declaration could never return: `boolean` is resolved as a
     * class name rather than the scalar type, and strpos() does not accept an
     * array needle.
     *
     * @param mixed $path Pattern to inspect; it is cast to string.
     */
    protected function complexRule($path): bool
    {
        return strpbrk((string) $path, '$*') !== false;
    }
88
89
    /**
     * Test a string against a robots.txt path pattern.
     *
     * The pattern is turned into a regular expression anchored at the start of
     * the string: every '*' matches any run of characters, a '$' at the very
     * end anchors the end of the string, and a trailing '/' becomes optional.
     * All other characters are matched literally.
     *
     * @param string $pattern Path pattern taken from an allow/disallow line.
     * @param string $string  String to test.
     *
     * @return bool
     */
    protected function match($pattern, $string)
    {
        $regex = str_replace('\*', '.*', preg_quote($pattern, '/'));

        // preg_quote() escaped a trailing '$'; turn it back into an end anchor.
        if (substr($regex, -2) === '\$') {
            $regex = substr($regex, 0, -2).'$';
        }

        $regex = preg_replace('/\/$/', '/?', $regex);

        return preg_match('/^'.$regex.'/', $string) === 1;
    }
99
100
    /**
     * Parse robots.txt text into allow/disallow rules grouped by user agent.
     *
     * Each user-agent line starts a new (initially empty) group; the allow and
     * disallow lines that follow are added to that group until the next
     * user-agent line. Lines before the first user-agent line are ignored.
     * Directives with an empty path are ignored as well, so a bare "Disallow:"
     * does not block anything.
     *
     * @param string $content Raw robots.txt text using any line-ending convention.
     *
     * @return array Map of lower-cased user agent => [pattern => bool], with
     *               true meaning "disallow", ordered by orderRules().
     */
    protected function getDisallowsPerUserAgent(string $content): array
    {
        // Split on \r\n, \r or \n instead of PHP_EOL: the file's line endings
        // depend on whoever wrote it, not on the platform running this code.
        $lines = preg_split('/\r\n|\r|\n/', $content) ?: [];

        $disallowsPerUserAgent = [];

        $currentUserAgent = null;

        foreach ($lines as $line) {
            if (trim($line) === '') {
                continue;
            }

            if ($this->isUserAgentLine($line)) {
                $currentUserAgent = $this->parseUserAgent($line);

                $disallowsPerUserAgent[$currentUserAgent] = [];

                continue;
            }

            if ($currentUserAgent === null) {
                continue;
            }

            $parsed = $this->parse($line);

            if ($parsed === null) { // other than allow/disallow
                continue;
            }

            [$pattern, $rule] = $parsed;

            // An empty pattern would match every path once compiled, turning a
            // bare "Disallow:" into "block everything".
            if ($pattern === '') {
                continue;
            }

            $disallowsPerUserAgent[$currentUserAgent][$pattern] = $rule;
        }

        return $this->orderRules($disallowsPerUserAgent);
    }
131
132
    // Google says (https://developers.google.com/search/reference/robots_txt) :
133
    // At a group-member level, in particular for allow and disallow directives, the most specific rule
134
    // based on the length of the [path] entry will trump the less specific (shorter) rule. The order of
135
    // precedence for rules with wildcards is undefined.
136
    /**
     * Order the rules of every user agent so that longer patterns come first.
     *
     * @param array $disallowsPerUserAgent Map of user agent => [pattern => rule].
     *
     * @return array The same map with each rule list sorted by pattern length, descending.
     */
    protected function orderRules(array $disallowsPerUserAgent): array
    {
        $ordered = [];

        foreach ($disallowsPerUserAgent as $userAgent => $rules) {
            // array_multisort() takes its arrays by reference, so the sort key
            // is held in a variable rather than passed as an expression.
            $patternLengths = array_map('strlen', array_keys($rules));

            array_multisort($patternLengths, SORT_DESC, $rules);

            $ordered[$userAgent] = $rules;
        }

        return $ordered;
    }
145
146
    /**
     * Tell whether a line is a user-agent directive.
     *
     * Spaces are removed before the check, and the comparison with
     * "user-agent:" at the start of the line is case-insensitive.
     *
     * @param string $line A single robots.txt line.
     */
    protected function isUserAgentLine(string $line): bool
    {
        $withoutSpaces = str_replace(' ', '', $line);

        $position = stripos($withoutSpaces, 'user-agent:');

        return $position === 0;
    }
150
151
    /**
     * Extract the lower-cased user agent name from a user-agent line.
     *
     * @param string $line A line starting with "User-agent" (any case), optional
     *                     whitespace and a colon.
     *
     * @return string The trimmed, lower-cased text after the colon.
     */
    protected function parseUserAgent(string $line): string
    {
        $trimmedLine = trim($line);

        $name = preg_replace('/^User-agent\s*:/i', '', $trimmedLine);

        return strtolower(trim($name));
    }
155
156
    /**
     * Parse an allow/disallow line into a [pattern, rule] pair.
     *
     * @param string $line A single robots.txt line.
     *
     * @return array|null [pattern, true] for a disallow line, [pattern, false]
     *                    for an allow line, or null for any other line
     *                    (crawl-delay, sitemap, comments, ...).
     */
    protected function parse(string $line): ?array
    {
        // NOTE(review): this replaces whitespace followed by '!' with ':'. It
        // may have been meant to normalise "Disallow :" - confirm the intent.
        $line = trim(preg_replace('/\s+!/', ':', $line));

        if (stripos($line, 'disallow:') === 0) {
            return [trim(preg_replace('/^disallow:/i', '', $line)), true];
        }

        if (stripos($line, 'allow:') === 0) {
            return [trim(preg_replace('/^allow:/i', '', $line)), false];
        }

        // A function declared with a ?array return type must return
        // explicitly; falling off the end raises a TypeError.
        return null;
    }
170
171
    /**
     * Tell whether a URL begins with the given path.
     *
     * @param string $url  String to inspect.
     * @param string $path Prefix to look for.
     */
    protected function isUrlInDirectory(string $url, string $path): bool
    {
        $position = strpos($url, $path);

        return $position === 0;
    }
175
}
176