1
|
|
|
<?php |
2
|
|
|
namespace vipnytt; |
3
|
|
|
|
4
|
|
|
/** |
5
|
|
|
* X-Robots-Tag HTTP header parser class |
6
|
|
|
* |
7
|
|
|
* @author VIP nytt ([email protected]) |
8
|
|
|
* @author Jan-Petter Gundersen ([email protected]) |
9
|
|
|
* |
10
|
|
|
* Project: |
11
|
|
|
* @link https://github.com/VIPnytt/X-Robots-Tag-parser |
12
|
|
|
* @license https://opensource.org/licenses/MIT MIT license |
13
|
|
|
* |
14
|
|
|
* Specification: |
15
|
|
|
* @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag#using-the-x-robots-tag-http-header |
16
|
|
|
*/ |
17
|
|
|
|
18
|
|
|
use vipnytt\XRobotsTagParser\Exceptions\XRobotsTagParserException; |
19
|
|
|
use vipnytt\XRobotsTagParser\Rebuild; |
20
|
|
|
use vipnytt\XRobotsTagParser\UserAgentParser; |
21
|
|
|
|
22
|
|
|
class XRobotsTagParser
{
    /**
     * Name of the HTTP header that carries the rules (compared case-insensitively)
     */
    const HEADER_RULE_IDENTIFIER = 'X-Robots-Tag';

    const DIRECTIVE_ALL = 'all';
    const DIRECTIVE_NONE = 'none';
    const DIRECTIVE_NO_ARCHIVE = 'noarchive';
    const DIRECTIVE_NO_FOLLOW = 'nofollow';
    const DIRECTIVE_NO_IMAGE_INDEX = 'noimageindex';
    const DIRECTIVE_NO_INDEX = 'noindex';
    const DIRECTIVE_NO_ODP = 'noodp';
    const DIRECTIVE_NO_SNIPPET = 'nosnippet';
    const DIRECTIVE_NO_TRANSLATE = 'notranslate';
    const DIRECTIVE_UNAVAILABLE_AFTER = 'unavailable_after';

    /**
     * User agent supplied to the constructor
     *
     * @var string
     */
    protected $userAgent = '';

    /**
     * Key of $rules chosen for $userAgent by the last parse() call
     *
     * @var string
     */
    protected $userAgentMatch = '';

    /**
     * Lowercased value of the header currently being processed
     *
     * @var string
     */
    protected $currentRule = '';

    /**
     * User agent prefix of the header currently being processed.
     * Defaults to '' so rules without a prefix are stored under the same ''
     * key that getRules() reads, without relying on null being cast to ''.
     *
     * @var string
     */
    protected $currentUserAgent = '';

    /**
     * Collected rules: user agent ('' = no prefix) => [directive => value]
     *
     * @var array
     */
    protected $rules = [];

    /**
     * Constructor
     *
     * @param string $userAgent
     * @param array|null $headers Header lines to parse immediately; nothing is parsed when null
     */
    public function __construct($userAgent = '', $headers = null)
    {
        $this->userAgent = $userAgent;
        if ($headers !== null) {
            $this->parse($headers);
        }
    }

    /**
     * Parse HTTP headers
     *
     * Every "X-Robots-Tag: ..." line is split into directives and added to the
     * rule set; all other lines are ignored. Afterwards the user agent group
     * matching the configured user agent is selected.
     *
     * @param array $headers List of raw "Name: value" header lines
     * @return void
     */
    public function parse(array $headers)
    {
        // Loop-invariant: the lowercased header name every line is compared against
        $identifier = mb_strtolower(self::HEADER_RULE_IDENTIFIER);
        foreach ($headers as $header) {
            if (!is_string($header)) {
                // Not a "Name: value" line (e.g. a nested array); cannot be a rule
                continue;
            }
            $parts = array_map('trim', explode(':', mb_strtolower($header), 2));
            if (count($parts) < 2 || $parts[0] !== $identifier) {
                // Header is not a rule
                continue;
            }
            $this->currentRule = $parts[1];
            $this->detectDirectives();
        }
        $userAgentParser = new UserAgentParser($this->userAgent);
        $match = $userAgentParser->match(array_keys($this->rules), '');
        // UserAgentParser::match() is defined elsewhere; presumably it can yield
        // false/null when nothing matches - TODO confirm. Normalise so the
        // property always holds a string usable as a $rules key.
        $this->userAgentMatch = ($match === false || $match === null) ? '' : (string)$match;
    }

    /**
     * Detect directives in rule
     *
     * Splits the current rule on commas. When the first element has the form
     * "name: rest" and "name" is not a known directive, "name" is taken as the
     * user agent the rule applies to. Every known directive is then added.
     *
     * @return void
     */
    protected function detectDirectives()
    {
        $known = $this->directiveClasses();
        $directives = array_map('trim', explode(',', $this->currentRule));
        $pair = array_map('trim', explode(':', $directives[0], 2));
        if (count($pair) === 2 && !array_key_exists($pair[0], $known)) {
            // Leading "something:" that is not a directive => user agent prefix
            $this->currentUserAgent = $pair[0];
            $directives[0] = $pair[1];
        }
        foreach ($directives as $rule) {
            // Only the part before an optional ':' names the directive
            $directive = trim(explode(':', $rule, 2)[0]);
            if (array_key_exists($directive, $known)) {
                $this->addRule($directive);
            }
        }
        $this->cleanup();
    }

    /**
     * Array of directives and their class names
     *
     * @return array Directive string => short class name in the "directives" sub-namespace
     */
    protected function directiveClasses()
    {
        return [
            self::DIRECTIVE_ALL => 'All',
            self::DIRECTIVE_NO_ARCHIVE => 'NoArchive',
            self::DIRECTIVE_NO_FOLLOW => 'NoFollow',
            self::DIRECTIVE_NO_IMAGE_INDEX => 'NoImageIndex',
            self::DIRECTIVE_NO_INDEX => 'NoIndex',
            self::DIRECTIVE_NONE => 'None',
            self::DIRECTIVE_NO_ODP => 'NoODP',
            self::DIRECTIVE_NO_SNIPPET => 'NoSnippet',
            self::DIRECTIVE_NO_TRANSLATE => 'NoTranslate',
            self::DIRECTIVE_UNAVAILABLE_AFTER => 'UnavailableAfter',
        ];
    }

    /**
     * Add rule
     *
     * Instantiates the class registered for the directive with the current
     * rule string and stores its directive/value pair for the current user agent.
     *
     * @param string $directive A key of directiveClasses()
     * @return void
     * @throws XRobotsTagParserException When the class does not implement directiveInterface
     */
    protected function addRule($directive)
    {
        if (!isset($this->rules[$this->currentUserAgent])) {
            $this->rules[$this->currentUserAgent] = [];
        }
        $class = "\\" . __CLASS__ . "\\directives\\" . $this->directiveClasses()[$directive];
        $object = new $class($this->currentRule);
        if (!$object instanceof XRobotsTagParser\directives\directiveInterface) {
            throw new XRobotsTagParserException('Unsupported directive class');
        }
        $this->rules[$this->currentUserAgent] = array_merge(
            $this->rules[$this->currentUserAgent],
            [$object->getDirective() => $object->getValue()]
        );
    }

    /**
     * Cleanup before next rule is read
     *
     * @return void
     */
    protected function cleanup()
    {
        $this->currentRule = '';
        $this->currentUserAgent = '';
    }

    /**
     * Return all applicable rules
     *
     * Rules without a user agent prefix are taken first; rules of the matched
     * user agent group are merged on top of them.
     *
     * @param bool $raw When true the merged rules are returned as collected; otherwise they are passed through Rebuild
     * @return array
     */
    public function getRules($raw = false)
    {
        $rules = [];
        // Default UserAgent
        if (isset($this->rules[''])) {
            $rules = array_merge($rules, $this->rules['']);
        }
        // Matching UserAgent
        if (isset($this->rules[$this->userAgentMatch])) {
            $rules = array_merge($rules, $this->rules[$this->userAgentMatch]);
        }
        if (!$raw) {
            $rebuild = new Rebuild($rules);
            $rules = $rebuild->getResult();
        }
        // Result
        return $rules;
    }

    /**
     * Export all rules for all UserAgents
     *
     * @return array
     */
    public function export()
    {
        return $this->rules;
    }

    /**
     * Get the meaning of a directive
     *
     * @param string $directive
     * @return string
     * @throws XRobotsTagParserException When the directive is unknown or its class is unsupported
     */
    public function getDirectiveMeaning($directive)
    {
        $classes = $this->directiveClasses();
        // Strict lookup: a loose in_array() would let values such as true
        // through and then fail on an undefined index instead of throwing.
        if (!is_string($directive) || !array_key_exists($directive, $classes)) {
            throw new XRobotsTagParserException('Unknown directive');
        }
        $class = "\\" . __CLASS__ . "\\directives\\" . $classes[$directive];
        // NOTE(review): the short class name (not the directive string) is
        // passed as the rule argument - confirm this is intended.
        $object = new $class($classes[$directive]);
        if (!$object instanceof XRobotsTagParser\directives\directiveInterface) {
            throw new XRobotsTagParserException('Unsupported directive class');
        }
        return $object->getMeaning();
    }
}
209
|
|
|
|
Our type inference engine has found a suspicious assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.
For example, imagine you have a variable `$accountId` that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the `id` property of an instance of the `Account` class. This class holds a proper account, so the id value must no longer be false.
Either this assignment is in error or a type check should be added for that assignment.