|
1
|
|
|
<?php |
|
2
|
|
|
namespace vipnytt\RobotsTxtParser\Parser\Directives; |
|
3
|
|
|
|
|
4
|
|
|
use vipnytt\RobotsTxtParser\Exceptions; |
|
5
|
|
|
use vipnytt\RobotsTxtParser\Parser\UrlParser; |
|
6
|
|
|
use vipnytt\RobotsTxtParser\RobotsTxtInterface; |
|
7
|
|
|
|
|
8
|
|
|
/**
 * Class DisAllowParser
 *
 * Parser for one of the directives listed in self::DIRECTIVE. Plain path
 * rules are stored in this object; rules that carry one of the white-listed
 * sub-directives (see self::SUB_DIRECTIVES) are handed over to the dedicated
 * Clean-param and Host parsers.
 *
 * @package vipnytt\RobotsTxtParser\Parser\Directives
 */
class DisAllowParser implements ParserInterface, RobotsTxtInterface
{
    use DirectiveParserCommons;
    use UrlParser;

    /**
     * Directive alternatives accepted by the constructor
     */
    const DIRECTIVE = [
        self::DIRECTIVE_ALLOW,
        self::DIRECTIVE_DISALLOW,
    ];

    /**
     * Sub directives white list (may appear inline inside a rule line)
     */
    const SUB_DIRECTIVES = [
        self::DIRECTIVE_CLEAN_PARAM,
        self::DIRECTIVE_HOST,
    ];

    /**
     * Directive this instance represents (one of self::DIRECTIVE)
     * @var string
     */
    private $directive;

    /**
     * Base Uri
     * @var string
     */
    private $base;

    /**
     * User-agent
     * @var string
     */
    private $userAgent;

    /**
     * Rule array; plain paths are kept under the 'path' key
     * @var array
     */
    private $array = [];

    /**
     * Sub-directive Clean-param
     * @var CleanParamParser
     */
    private $cleanParam;

    /**
     * Sub-directive Host
     * @var HostParser
     */
    private $host;

    /**
     * DisAllow constructor
     *
     * Stores the context, validates the directive name against
     * self::DIRECTIVE and creates one parser per supported sub-directive.
     *
     * @param string $base
     * @param string $userAgent
     * @param string $directive
     */
    public function __construct($base, $userAgent, $directive)
    {
        $this->base = $base;
        $this->userAgent = $userAgent;
        $this->directive = $this->validateDirective($directive, self::DIRECTIVE);
        $this->cleanParam = new CleanParamParser();
        $this->host = new HostParser();
    }

    /**
     * Add
     *
     * Routes the rule line to the matching sub-directive parser, or stores
     * it as a plain path when no white-listed sub-directive is recognised.
     *
     * @param string $line
     * @return bool
     */
    public function add($line)
    {
        $pair = $this->generateRulePair($line, self::SUB_DIRECTIVES);
        switch ($pair['directive']) {
            case self::DIRECTIVE_CLEAN_PARAM:
                return $this->cleanParam->add($pair['value']);
            case self::DIRECTIVE_HOST:
                return $this->host->add($pair['value']);
        }
        // Not a sub-directive: the whole line is the path rule.
        return $this->addPath($line);
    }

    /**
     * Add plain path to allow/disallow
     *
     * @param string $rule
     * @return bool False if the exact same rule is already stored, true once added
     */
    protected function addPath($rule)
    {
        // Strict comparison is required: with loose comparison PHP treats
        // numeric-looking strings as numbers ('1e3' == '1000'), which would
        // wrongly reject a distinct rule as a duplicate.
        if (isset($this->array['path']) && in_array($rule, $this->array['path'], true)) {
            return false;
        }
        $this->array['path'][] = $rule;
        return true;
    }

    /**
     * Check
     *
     * Tests the URL against the stored paths first, then against the
     * Clean-param rules and finally against the Host rules; evaluation stops
     * at the first match.
     *
     * @param string $url
     * @return bool
     * @throws Exceptions\ClientException
     */
    public function check($url)
    {
        $path = $this->getPath($url);
        if ($path === false) {
            return false;
        }
        $paths = isset($this->array['path']) ? $this->array['path'] : [];
        return $this->checkPath($path, $paths)
            || $this->cleanParam->check($path)
            || $this->host->check($url);
    }

    /**
     * Get path and query
     *
     * An input starting with a slash is treated as already being a path and
     * only has its fragment removed. Anything else must pass URL validation
     * and is reduced to its path (defaulting to '/') plus the query string,
     * if present.
     *
     * @param string $url
     * @return string
     * @throws Exceptions\ClientException
     */
    protected function getPath($url)
    {
        // Encode
        $url = $this->urlEncode($url);
        if (mb_stripos($url, '/') === 0) {
            // Strip fragments. '#' is a single ASCII character, so a plain
            // explode() is sufficient and does not depend on the mbregex
            // engine or its encoding state the way mb_split() does.
            return explode('#', $url, 2)[0];
        }
        if (!$this->urlValidate($url)) {
            throw new Exceptions\ClientException('Invalid URL');
        }
        $path = parse_url($url, PHP_URL_PATH);
        if ($path === null) {
            // No path component in the URL: fall back to the root.
            $path = '/';
        }
        $query = parse_url($url, PHP_URL_QUERY);
        $query = ($query === null) ? '' : '?' . $query;
        return $path . $query;
    }

    /**
     * Rule array
     *
     * Merges the stored paths with the rules of both sub-directive parsers
     * and nests the result under the directive name.
     *
     * @return array Empty array when there are no rules at all
     */
    public function getRules()
    {
        $result = array_merge(
            $this->array,
            $this->cleanParam->getRules(),
            $this->host->getRules()
        );
        return empty($result) ? [] : [$this->directive => $result];
    }

    /**
     * Render
     *
     * Produces one "<directive>:<value>" line for every stored path and for
     * every entry rendered by the sub-directive parsers.
     *
     * @return string[]
     */
    public function render()
    {
        $result = [];
        $render = array_merge(
            $this->array,
            $this->cleanParam->render(),
            $this->host->render()
        );
        foreach ($render as $value) {
            if (is_array($value)) {
                // Grouped entries (e.g. the 'path' list): one line per item.
                foreach ($value as $path) {
                    $result[] = $this->directive . ':' . $path;
                }
                continue;
            }
            $result[] = $this->directive . ':' . $value;
        }
        return $result;
    }
}
|
199
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.