GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.
Completed
Pull Request — master (#13)
by Dev
01:10
created

RobotsTxt::isCommentLine()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
nc 1
nop 1
dl 0
loc 4
rs 10
c 0
b 0
f 0
1
<?php
2
3
namespace Spatie\Robots;
4
5
/**
 * Parses the contents of a robots.txt file and answers whether a given URL
 * may be crawled by a given user agent.
 *
 * Rules are stored per (lower-cased) user agent as a map of
 * path pattern => bool, where `true` means "disallow" and `false` means "allow".
 */
class RobotsTxt
{
    /** Not referenced inside this class; kept for backward compatibility. */
    protected static $robotsCache = [];

    /** Map of lower-cased user agent => [path pattern => bool (true = disallowed)]. */
    protected $disallowsPerUserAgent = [];

    /**
     * Builds an instance from the file or URL at $source.
     *
     * Reading is best-effort: when the source cannot be read the instance is
     * created from empty content (no rules, so everything is allowed).
     */
    public static function readFrom(string $source): self
    {
        // file_get_contents() signals failure with false (never null), so the
        // failure case has to be checked explicitly.
        $content = @file_get_contents($source);

        return new self($content === false ? '' : $content);
    }

    /**
     * @param string $content Raw robots.txt content.
     */
    public function __construct(string $content)
    {
        $this->disallowsPerUserAgent = $this->getDisallowsPerUserAgent($content);
    }

    /**
     * Builds an instance from either a robots.txt location or raw content.
     *
     * A single-line $source containing both "http" and "robots.txt" is
     * treated as a location and read; anything else is parsed as content.
     */
    public static function create(string $source): self
    {
        // Text with line breaks cannot be a location. Without this guard,
        // content that merely mentions a robots.txt URL (e.g. in a Sitemap
        // line) would be "fetched", fail, and silently yield no rules.
        $isSingleLine = strpbrk($source, "\r\n") === false;

        $looksLikeLocation = $isSingleLine
            && strpos($source, 'http') !== false
            && strpos($source, 'robots.txt') !== false;

        return $looksLikeLocation ? self::readFrom($source) : new self($source);
    }

    /**
     * Tells whether $userAgent may fetch $url.
     *
     * Google says (https://developers.google.com/search/reference/robots_txt):
     * Only one group of group-member records is valid for a particular crawler.
     * The crawler must determine the correct group of records by finding the group
     * with the most specific user-agent that still matches.
     *
     * Lookup order: exact user-agent group, then the longest matching prefix
     * wildcard group (e.g. "google*"), then the "*" group. Without any
     * applicable group the URL is allowed.
     *
     * @param string      $url       Absolute URL or path.
     * @param string|null $userAgent Crawler name; null is treated as "*".
     */
    public function allows(string $url, ?string $userAgent = '*'): bool
    {
        // Resolve null before lower-casing; strtolower(null) would yield ''.
        $userAgent = strtolower($userAgent ?? '*');

        // parse_url() yields null when there is no path and false for
        // seriously malformed URLs; both are treated as an empty path.
        $path = parse_url($url, PHP_URL_PATH);
        if (! is_string($path)) {
            $path = '';
        }

        $groups = $this->disallowsPerUserAgent;

        if ($userAgent !== '*') {
            if (isset($groups[$userAgent])) {
                return ! $this->pathIsDenied($path, $groups[$userAgent]);
            }

            $wildCardUserAgent = $this->getWildCardUserAgent($userAgent);
            if ($wildCardUserAgent !== null) {
                return ! $this->pathIsDenied($path, $groups[$wildCardUserAgent]);
            }
        }

        if (isset($groups['*'])) {
            return ! $this->pathIsDenied($path, $groups['*']);
        }

        return true;
    }

    /**
     * Finds the most specific "prefix*" group that matches $userAgent.
     *
     * Prefixes are tried from longest to shortest so that "google*" wins
     * over "g*" for the agent "googlebot".
     *
     * @return string|null The matching group key, or null when there is none.
     */
    protected function getWildCardUserAgent(string $userAgent): ?string
    {
        if ($userAgent === '*') {
            return null;
        }

        for ($length = strlen($userAgent); $length >= 1; $length--) {
            $candidate = substr($userAgent, 0, $length).'*';

            if (isset($this->disallowsPerUserAgent[$candidate])) {
                return $candidate;
            }
        }

        return null;
    }

    /**
     * Returns the flag of the first rule whose pattern matches $path
     * (true = disallowed), or false when no rule matches.
     *
     * $rules is expected to be ordered already (see orderRules()).
     */
    protected function pathIsDenied(string $path, array $rules)
    {
        foreach ($rules as $pattern => $rule) {
            // Numeric-looking patterns come back from the array as int keys.
            if ($this->match((string) $pattern, $path)) {
                return (bool) $rule;
            }
        }

        return false;
    }

    /**
     * Tells whether $path contains one of the special characters "$" or "*".
     */
    protected function complexRule($path): bool
    {
        return strpbrk((string) $path, '$*') !== false;
    }

    /**
     * Tests $string against a robots.txt path pattern.
     *
     * The pattern is anchored at the start of $string. "*" matches any run of
     * characters, a trailing "$" anchors the end, and a trailing "/" is
     * optional.
     */
    protected function match($pattern, $string)
    {
        $regex = preg_quote($pattern, '/');

        // preg_quote() escaped the wildcard; turn it back into "anything".
        $regex = str_replace('\*', '.*', $regex);

        // An escaped "$" at the very end is the end-of-path anchor.
        if (substr($regex, -2) === '\$') {
            $regex = substr($regex, 0, -2).'$';
        }

        // Make a trailing slash optional.
        $regex = preg_replace('/\/$/', '/?', $regex);

        return (bool) preg_match('/^'.$regex.'/', $string);
    }

    /**
     * Parses robots.txt content into ordered rule sets per user agent.
     *
     * Lines before the first User-agent line are ignored. A User-agent line
     * starts a fresh (empty) rule set for that agent. Directives other than
     * allow/disallow are skipped, and so are allow/disallow lines with an
     * empty path: an empty pattern would match every path, turning the
     * common "Disallow:" (allow everything) into "deny everything".
     */
    protected function getDisallowsPerUserAgent(string $content): array
    {
        // Split on any line-ending style instead of the platform's PHP_EOL.
        $lines = preg_split('/\r\n|\r|\n/', $content) ?: [];

        $lines = array_filter($lines);

        $disallowsPerUserAgent = [];

        $currentUserAgent = null;

        foreach ($lines as $line) {
            if ($this->isUserAgentLine($line)) {
                $currentUserAgent = $this->parseUserAgent($line);
                $disallowsPerUserAgent[$currentUserAgent] = [];

                continue;
            }

            if ($currentUserAgent === null) {
                continue;
            }

            $parsed = $this->parse($line);
            if ($parsed === null) { // other than allow/disallow
                continue;
            }

            list($pattern, $rule) = $parsed;
            if ($pattern === '') {
                continue;
            }

            $disallowsPerUserAgent[$currentUserAgent][$pattern] = $rule;
        }

        return $this->orderRules($disallowsPerUserAgent);
    }

    /**
     * Orders every rule set from the longest pattern to the shortest.
     *
     * Google says (https://developers.google.com/search/reference/robots_txt):
     * At a group-member level, in particular for allow and disallow directives, the most specific rule
     * based on the length of the [path] entry will trump the less specific (shorter) rule. The order of
     * precedence for rules with wildcards is undefined.
     *
     * Patterns of equal length are ordered allow-before-disallow. Keys are
     * preserved as-is (numeric-looking patterns are not re-indexed).
     */
    protected function orderRules(array $disallowsPerUserAgent): array
    {
        foreach ($disallowsPerUserAgent as $userAgent => $rules) {
            uksort($rules, static function ($a, $b) use ($rules): int {
                $byLength = strlen((string) $b) <=> strlen((string) $a);

                return $byLength !== 0 ? $byLength : ($rules[$a] <=> $rules[$b]);
            });

            $disallowsPerUserAgent[$userAgent] = $rules;
        }

        return $disallowsPerUserAgent;
    }

    /**
     * Tells whether $line is a "User-agent:" line (case-insensitive, spaces ignored).
     */
    protected function isUserAgentLine(string $line): bool
    {
        return stripos(str_replace(' ', '', $line), 'user-agent:') === 0;
    }

    /**
     * Extracts the lower-cased user agent name from a "User-agent:" line.
     */
    protected function parseUserAgent(string $line): string
    {
        return strtolower(trim(preg_replace('/^User-agent\s*:/i', '', trim($line))));
    }

    /**
     * Parses an allow/disallow line.
     *
     * @return array|null [pattern, isDisallowed], or null for any other
     *                    directive (crawl-delay, sitemap, ...).
     */
    protected function parse(string $line): ?array
    {
        // Tolerate whitespace between the directive name and its colon
        // ("Disallow : /x") without touching the path itself.
        $line = trim((string) preg_replace('/^\s*([a-z-]+)\s+:/i', '$1:', $line));

        if (stripos($line, 'disallow:') === 0) {
            return [trim(preg_replace('/^disallow:/i', '', $line)), true];
        }

        if (stripos($line, 'allow:') === 0) {
            return [trim(preg_replace('/^allow:/i', '', $line)), false];
        }

        // Explicit null: falling off the end of a function declared
        // ": ?array" raises a TypeError instead of returning null.
        return null;
    }

    /**
     * Tells whether $url starts with $path.
     */
    protected function isUrlInDirectory(string $url, string $path): bool
    {
        return strpos($url, $path) === 0;
    }
}
180