|
1
|
|
|
<?php |
|
2
|
|
|
namespace vipnytt\RobotsTxtParser\Parser\Directives; |
|
3
|
|
|
|
|
4
|
|
|
use vipnytt\RobotsTxtParser\Client\Directives\UserAgentClient; |
|
5
|
|
|
use vipnytt\RobotsTxtParser\RobotsTxtInterface; |
|
6
|
|
|
use vipnytt\UserAgentParser as UAStringParser; |
|
7
|
|
|
|
|
8
|
|
|
/**
 * Class UserAgentParser
 *
 * Collects the sub-directives (allow, disallow, crawl-delay, ...) that belong
 * to each user-agent group and exposes them as rule arrays, rendered lines,
 * or per-user-agent client objects.
 *
 * @package vipnytt\RobotsTxtParser\Parser\Directives
 */
class UserAgentParser implements ParserInterface, RobotsTxtInterface
{
    use DirectiveParserCommons;

    /**
     * Sub directives white list
     */
    const SUB_DIRECTIVES = [
        self::DIRECTIVE_ALLOW,
        self::DIRECTIVE_CACHE_DELAY,
        self::DIRECTIVE_COMMENT,
        self::DIRECTIVE_CRAWL_DELAY,
        self::DIRECTIVE_DISALLOW,
        self::DIRECTIVE_REQUEST_RATE,
        self::DIRECTIVE_ROBOT_VERSION,
        self::DIRECTIVE_VISIT_TIME,
    ];

    /**
     * Directive
     */
    const DIRECTIVE = self::DIRECTIVE_USER_AGENT;

    /**
     * Base Uri
     * @var string
     */
    private $base;

    /**
     * Sub-directive handlers, keyed by lower-cased user-agent
     * @var SubDirectiveHandler[]
     */
    private $handler = [];

    /**
     * User-agent(s) that subsequently added lines apply to
     * @var string[]
     */
    private $userAgent = [self::USER_AGENT];

    /**
     * User-agent client cache, keyed by lower-cased user-agent and then by
     * the string form of the status code ('' when no status code was given)
     * @var UserAgentClient[][]
     */
    private $client = [];

    /**
     * UserAgent constructor.
     *
     * @param string $base
     */
    public function __construct($base)
    {
        $this->base = $base;
        // Registers a handler for the default user-agent, which client()
        // relies on as its fallback.
        $this->set();
    }

    /**
     * Set new User-agent
     *
     * Makes the given user-agents the current group and creates a handler
     * for each one that does not have a handler yet.
     *
     * @param array $array
     * @return bool Always true
     */
    public function set(array $array = [self::USER_AGENT])
    {
        $this->userAgent = array_map('mb_strtolower', $array);
        foreach ($this->userAgent as $userAgent) {
            // Direct key lookup instead of a loose in_array() over array_keys():
            // numeric-string keys are stored as integers, and on PHP < 8 a
            // loose comparison treats any non-numeric string as equal to 0,
            // which could wrongly report a handler as already present.
            if (!isset($this->handler[$userAgent])) {
                $this->handler[$userAgent] = new SubDirectiveHandler($this->base, $userAgent);
            }
        }
        return true;
    }

    /**
     * Add
     *
     * Forwards one rule line to the matching sub-directive of every
     * user-agent in the current group.
     *
     * @param string $line
     * @return bool True if at least one sub-directive parser returned true
     */
    public function add($line)
    {
        $pair = $this->generateRulePair($line, self::SUB_DIRECTIVES);
        // generateRulePair() is defined outside this class body; guard against
        // it returning anything without a usable directive, instead of
        // indexing it blindly.
        if (!isset($pair['directive']) || !is_string($pair['directive'])) {
            return false;
        }
        $methods = $this->subDirectiveMethodMap();
        if (!isset($methods[$pair['directive']])) {
            return false;
        }
        $method = $methods[$pair['directive']];
        $accepted = false;
        // No early exit: every user-agent in the group must receive the rule.
        foreach ($this->userAgent as $userAgent) {
            if ($this->handler[$userAgent]->{$method}()->add($pair['value']) === true) {
                $accepted = true;
            }
        }
        return $accepted;
    }

    /**
     * Client
     *
     * @param string $userAgent
     * @param int|null $statusCode
     * @return UserAgentClient
     */
    public function client($userAgent = self::USER_AGENT, $statusCode = null)
    {
        // Normalise before touching the cache so the lookup key and the
        // storage key agree regardless of the caller's letter case.
        $userAgent = mb_strtolower($userAgent);
        // The status code is part of the key so a cached client is never
        // returned for a request made with a different status code.
        $statusKey = (string)$statusCode;
        if (isset($this->client[$userAgent][$statusKey])) {
            return $this->client[$userAgent][$statusKey];
        }
        $userAgentParser = new UAStringParser($userAgent);
        $userAgentMatch = $userAgentParser->match($this->getUserAgents());
        if ($userAgentMatch === false) {
            // No registered group matched; use the default group.
            $userAgentMatch = self::USER_AGENT;
        }
        return $this->client[$userAgent][$statusKey] = new UserAgentClient(
            $this->handler[$userAgentMatch],
            $this->base,
            $statusCode
        );
    }

    /**
     * User-agent list
     *
     * @return string[]
     */
    public function getUserAgents()
    {
        return array_keys($this->handler);
    }

    /**
     * Rule array
     *
     * @return array Empty when no user-agent has any rules; otherwise the
     *               per-user-agent rules wrapped under the directive name
     */
    public function getRules()
    {
        $result = [];
        foreach ($this->getUserAgents() as $userAgent) {
            $current = $this->mergeSubDirectives($userAgent, 'getRules');
            if (!empty($current)) {
                $result[$userAgent] = $current;
            }
        }
        return empty($result) ? [] : [self::DIRECTIVE => $result];
    }

    /**
     * Render
     *
     * @return string[]
     */
    public function render()
    {
        $userAgents = $this->getUserAgents();
        sort($userAgents);
        $result = [];
        foreach ($userAgents as $userAgent) {
            $current = $this->mergeSubDirectives($userAgent, 'render');
            if (!empty($current)) {
                // Each non-empty group starts with its own user-agent line.
                $result = array_merge($result, [self::DIRECTIVE . ':' . $userAgent], $current);
            }
        }
        return $result;
    }

    /**
     * Map of sub-directive name => SubDirectiveHandler accessor method
     *
     * @return string[]
     */
    private function subDirectiveMethodMap()
    {
        return [
            self::DIRECTIVE_ALLOW => 'allow',
            self::DIRECTIVE_CACHE_DELAY => 'cacheDelay',
            self::DIRECTIVE_COMMENT => 'comment',
            self::DIRECTIVE_CRAWL_DELAY => 'crawlDelay',
            self::DIRECTIVE_DISALLOW => 'disallow',
            self::DIRECTIVE_REQUEST_RATE => 'requestRate',
            self::DIRECTIVE_ROBOT_VERSION => 'robotVersion',
            self::DIRECTIVE_VISIT_TIME => 'visitTime',
        ];
    }

    /**
     * Call the same method on every sub-directive of one user-agent and merge
     * the returned arrays in a fixed order.
     *
     * @param string|int $userAgent Key into the handler list
     * @param string $method Method to call on each sub-directive ('getRules' or 'render')
     * @return array
     */
    private function mergeSubDirectives($userAgent, $method)
    {
        // The order below determines the order of the merged output.
        $accessors = [
            'robotVersion',
            'visitTime',
            'disallow',
            'allow',
            'crawlDelay',
            'cacheDelay',
            'requestRate',
            'comment',
        ];
        $merged = [];
        foreach ($accessors as $accessor) {
            $merged = array_merge($merged, $this->handler[$userAgent]->{$accessor}()->{$method}());
        }
        return $merged;
    }
}
|
213
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.