1
|
|
|
<?php |
2
|
|
|
namespace vipnytt\RobotsTxtParser; |
3
|
|
|
|
4
|
|
|
use DateTime; |
5
|
|
|
use GuzzleHttp; |
6
|
|
|
use vipnytt\RobotsTxtParser\Client; |
7
|
|
|
use vipnytt\RobotsTxtParser\Parser\RobotsTxtInterface; |
8
|
|
|
|
9
|
|
|
/**
 * Class Download
 *
 * Fetches `/robots.txt` relative to a base URI with Guzzle and keeps the
 * outcome (status code, body, character encoding, Cache-Control max-age and
 * the time of the download) so that a parser client can be built from it.
 *
 * The constants referenced through `self::` (MAX_REDIRECTS, ENCODING,
 * BYTE_LIMIT, CACHE_TIME) are not declared in this class; they are inherited
 * from RobotsTxtInterface.
 *
 * @package vipnytt\RobotsTxtParser
 */
class Download implements RobotsTxtInterface
{
    /**
     * Base uri
     * @var string
     */
    protected $baseUri;

    /**
     * Download timestamp (Unix time), set for successful and failed downloads alike
     * @var int
     */
    protected $time;

    /**
     * Cache-Control max-age in seconds, 0 when absent
     * @var int
     */
    protected $maxAge;

    /**
     * HTTP Status code
     * @var int
     */
    protected $statusCode;

    /**
     * Robots.txt contents
     * @var string
     */
    protected $contents;

    /**
     * Robots.txt character encoding
     * @var string
     */
    protected $encoding;

    /**
     * Parser client class, created lazily by parserClient()
     * @var Client
     */
    protected $parserClient;

    /**
     * Download constructor.
     *
     * Performs the HTTP request immediately. A Guzzle transfer failure is not
     * propagated: the instance is instead populated with status code 523,
     * empty contents, the default encoding and a max-age of 0.
     *
     * @param string $baseUri
     * @param array $guzzleConfig Guzzle client options; values given here override the defaults below
     */
    public function __construct($baseUri, array $guzzleConfig = [])
    {
        $this->baseUri = $baseUri;
        $defaults = [
            'allow_redirects' => [
                'max' => self::MAX_REDIRECTS,
                'referer' => true,
                'strict' => true,
            ],
            'base_uri' => $baseUri,
            'decode_content' => true,
            'headers' => [
                'Accept' => 'text/plain;q=1.0, text/*;q=0.8, */*;q=0.1',
                'Accept-Charset' => 'utf-8;q=1.0, *;q=0.1',
                'Accept-Encoding' => 'identity;q=1.0, *;q=0.1',
                'User-Agent' => 'RobotsTxtParser-VIPnytt/1.0 (+https://github.com/VIPnytt/RobotsTxtParser/blob/master/README.md)',
            ],
            'http_errors' => false,
            'timeout' => 60,
            'verify' => true,
        ];
        try {
            // array_replace_recursive() lets a caller value *override* a default.
            // array_merge_recursive() would instead combine two scalars that share
            // a key into an array (e.g. 'timeout' => [60, 30]), which is not a
            // valid Guzzle option value.
            $client = new GuzzleHttp\Client(array_replace_recursive($defaults, $guzzleConfig));
            $response = $client->request('GET', '/robots.txt');
            $this->time = time();
            $this->statusCode = $response->getStatusCode();
            $this->contents = $response->getBody()->getContents();
            $this->encoding = $this->headerEncoding($response->getHeader('content-type'));
            $this->maxAge = $this->headerMaxAge($response->getHeader('cache-control'));
        } catch (GuzzleHttp\Exception\TransferException $e) {
            // Stamp the failure too, otherwise nextUpdate()/validUntil() would be
            // calculated from an unset timestamp.
            $this->time = time();
            $this->statusCode = 523;
            $this->contents = '';
            $this->encoding = self::ENCODING;
            $this->maxAge = 0;
        }
    }

    /**
     * Find the value of a `name=value` directive in a set of header lines
     *
     * Each header line is split on the delimiter, every part is trimmed, and
     * the first part that starts with `<directive>=` (case-insensitive) wins.
     * Surrounding whitespace and quote characters are removed from the value.
     *
     * @param array $headers Header lines, as returned by getHeader()
     * @param string $delimiter Separator between directives; passed to mb_split(), so it is a regex pattern
     * @param string $directive Directive name without the trailing `=`
     * @return string|null The value, or null if the directive is not present
     */
    private function headerDirective(array $headers, $delimiter, $directive)
    {
        $prefix = $directive . '=';
        foreach ($headers as $header) {
            $parts = mb_split($delimiter, $header);
            if ($parts === false) {
                // mb_split() failed on this line; skip it instead of passing false on
                continue;
            }
            foreach (array_map('trim', $parts) as $part) {
                if (mb_stripos($part, $prefix) === 0) {
                    return trim(mb_substr($part, mb_strlen($prefix)), " \t\"'");
                }
            }
        }
        return null;
    }

    /**
     * Content-Type encoding HTTP header
     *
     * @param array $headers Content-Type header lines
     * @return string The `charset` parameter, or the default encoding if it is missing or empty
     */
    protected function headerEncoding(array $headers)
    {
        $charset = $this->headerDirective($headers, ';', 'charset');
        if ($charset === null || $charset === '') {
            return self::ENCODING;
        }
        return $charset;
    }

    /**
     * Cache-Control max-age HTTP header
     *
     * @param array $headers Cache-Control header lines
     * @return int The `max-age` directive in seconds, never negative; 0 if it is missing
     */
    protected function headerMaxAge(array $headers)
    {
        $maxAge = $this->headerDirective($headers, ',', 'max-age');
        if ($maxAge === null) {
            return 0;
        }
        return max(0, intval($maxAge));
    }

    /**
     * Parser client
     *
     * The client is created on the first call and reused afterwards, so the
     * byte limit of the first call is the one that applies.
     *
     * @param int|null $byteLimit
     * @return Client
     */
    public function parserClient($byteLimit = self::BYTE_LIMIT)
    {
        if (!is_object($this->parserClient)) {
            $this->parserClient = new Client(
                $this->baseUri,
                $this->getStatusCode(),
                $this->getContents(),
                $this->getEncoding(),
                $byteLimit
            );
        }
        return $this->parserClient;
    }

    /**
     * Status code
     *
     * @return int
     */
    public function getStatusCode()
    {
        return $this->statusCode;
    }

    /**
     * URL content
     *
     * @return string
     */
    public function getContents()
    {
        return $this->contents;
    }

    /**
     * Encoding
     *
     * @return string
     */
    public function getEncoding()
    {
        return $this->encoding;
    }

    /**
     * Next update timestamp
     *
     * Download time plus CACHE_TIME.
     *
     * @return \DateTime
     */
    public function nextUpdate()
    {
        $dateTime = new DateTime;
        $dateTime->setTimestamp($this->time + self::CACHE_TIME);
        return $dateTime;
    }

    /**
     * Valid until timestamp
     *
     * Download time plus the larger of CACHE_TIME and the max-age received.
     *
     * @return \DateTime
     */
    public function validUntil()
    {
        $dateTime = new DateTime;
        $dateTime->setTimestamp($this->time + max(self::CACHE_TIME, $this->maxAge));
        return $dateTime;
    }
}
211
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.