1
|
|
|
<?php |
2
|
|
|
namespace vipnytt\RobotsTxtParser; |
3
|
|
|
|
4
|
|
|
use DateTime; |
5
|
|
|
use GuzzleHttp; |
6
|
|
|
use vipnytt\RobotsTxtParser\Client; |
7
|
|
|
use vipnytt\RobotsTxtParser\Parser\RobotsTxtInterface; |
8
|
|
|
|
9
|
|
|
/**
 * Class Download
 *
 * Fetches `/robots.txt` from a base URI over HTTP and keeps the raw result
 * (status code, body and character encoding) together with the time of the
 * download. A parser client for the downloaded data is created on demand.
 *
 * The constants referenced through `self::` (MAX_REDIRECTS, ENCODING,
 * BYTE_LIMIT, CACHE_TIME) are not declared in this class; presumably they
 * come from RobotsTxtInterface - verify against that interface.
 *
 * @package vipnytt\RobotsTxtParser
 */
class Download implements RobotsTxtInterface
{
    /**
     * Base uri the robots.txt was requested from
     * @var string
     */
    protected $baseUri;

    /**
     * Download time (UNIX timestamp), also set when the download failed
     * @var int
     */
    protected $time;

    /**
     * Robots.txt max-age in seconds. Never assigned inside this class, so it
     * stays null unless a subclass sets it.
     * @var int|null
     */
    protected $maxAge = null;

    /**
     * HTTP Status code
     * @var int
     */
    protected $statusCode;

    /**
     * Robots.txt contents
     * @var string
     */
    protected $contents;

    /**
     * Robots.txt character encoding
     * @var string
     */
    protected $encoding;

    /**
     * Parser client class, created lazily by parserClient()
     * @var Client
     */
    protected $parserClient;

    /**
     * Download constructor.
     *
     * Performs the HTTP request immediately. Transfer failures are not
     * propagated: they are recorded as status code 523 with empty contents
     * and the default encoding.
     *
     * @param string $baseUri
     * @param array $guzzleConfig Guzzle client options; values given here override the defaults below
     */
    public function __construct($baseUri, array $guzzleConfig = [])
    {
        $this->baseUri = $baseUri;
        $defaults = [
            'allow_redirects' => [
                'max' => self::MAX_REDIRECTS,
                'referer' => true,
                'strict' => true,
                'track_redirects' => true,
            ],
            'base_uri' => $baseUri,
            'headers' => [
                'Accept' => 'text/plain;q=1.0, text/*;q=0.8, */*;q=0.1',
                'Accept-Charset' => 'utf-8;q=1.0, *;q=0.1',
                'Accept-Encoding' => 'identity;q=1.0, *;q=0.1',
                'User-Agent' => 'RobotsTxtParser-VIPnytt/1.0 (+https://github.com/VIPnytt/RobotsTxtParser/blob/master/README.md)',
            ],
            'http_errors' => false,
            'timeout' => 60,
            'verify' => true,
        ];
        try {
            // array_replace_recursive() lets a caller-supplied option replace
            // the default. array_merge_recursive() would instead combine two
            // scalars into an array (e.g. 'timeout' => [60, 30]).
            $client = new GuzzleHttp\Client(array_replace_recursive($defaults, $guzzleConfig));
            $response = $client->request('GET', '/robots.txt');
            $this->time = time();
            $this->statusCode = $response->getStatusCode();
            $this->contents = $response->getBody()->getContents();
            // Must run after the contents are stored: headerEncoding() falls
            // back to detecting the encoding from the contents.
            $this->encoding = $this->headerEncoding($response->getHeader('content-type'));
        } catch (GuzzleHttp\Exception\TransferException $e) {
            // Record the time of the failed attempt too, so that nextUpdate()
            // and validUntil() are computed from a real timestamp.
            $this->time = time();
            // 523 is the status code this class reports for a failed transfer.
            $this->statusCode = 523;
            $this->contents = '';
            $this->encoding = self::ENCODING;
        }
    }

    /**
     * HTTP header encoding
     *
     * Looks for a `charset=` parameter in the given Content-Type header
     * values and returns the first one that mbstring lists as a supported
     * encoding. Falls back to detectEncoding() when none qualifies.
     *
     * @param array $headers Content-Type header values
     * @return string
     */
    protected function headerEncoding(array $headers)
    {
        // Built once; the list does not change between iterations.
        $supported = array_map('mb_strtolower', mb_list_encodings());
        foreach ($headers as $header) {
            foreach (array_map('trim', mb_split(';', $header)) as $parameter) {
                if (mb_stripos($parameter, 'charset=') !== 0) {
                    continue;
                }
                // The value may be quoted, e.g. charset="utf-8".
                $encoding = trim(mb_split('=', $parameter, 2)[1], " \t\"'");
                if (in_array(mb_strtolower($encoding), $supported, true)) {
                    return $encoding;
                }
            }
        }
        return $this->detectEncoding();
    }

    /**
     * Manually detect encoding
     *
     * Detects the encoding from the downloaded contents, falling back to the
     * default encoding when detection fails.
     *
     * @return string
     */
    protected function detectEncoding()
    {
        $encoding = mb_detect_encoding($this->getContents());
        if ($encoding !== false) {
            return $encoding;
        }
        return self::ENCODING;
    }

    /**
     * URL content
     *
     * @return string
     */
    public function getContents()
    {
        return $this->contents;
    }

    /**
     * Parser client
     *
     * Creates the parser client on first use and returns the same instance
     * afterwards; $byteLimit is therefore only honoured on the first call.
     *
     * @param int|null $byteLimit
     * @return Client
     */
    public function parserClient($byteLimit = self::BYTE_LIMIT)
    {
        // instanceof resolves `Client` through the namespace/imports. A plain
        // 'Client' string passed to is_a() would refer to the global \Client
        // class and never match.
        if (!$this->parserClient instanceof Client) {
            $this->parserClient = new Client(
                $this->baseUri,
                $this->getStatusCode(),
                $this->getContents(),
                $this->getEncoding(),
                $byteLimit
            );
        }
        return $this->parserClient;
    }

    /**
     * Status code
     *
     * @return int
     */
    public function getStatusCode()
    {
        return $this->statusCode;
    }

    /**
     * Encoding
     *
     * @return string
     */
    public function getEncoding()
    {
        return $this->encoding;
    }

    /**
     * Next update timestamp
     *
     * Download time plus the cache time.
     *
     * @return DateTime
     */
    public function nextUpdate()
    {
        $dateTime = new DateTime;
        $dateTime->setTimestamp($this->time + self::CACHE_TIME);
        return $dateTime;
    }

    /**
     * Valid until timestamp
     *
     * Download time plus the larger of the cache time and the max-age
     * (a max-age that is not an integer counts as 0).
     *
     * @return DateTime
     */
    public function validUntil()
    {
        $maxAge = is_int($this->maxAge) ? $this->maxAge : 0;
        $dateTime = new DateTime;
        $dateTime->setTimestamp($this->time + max(self::CACHE_TIME, $maxAge));
        return $dateTime;
    }
}
209
|
|
|
|
An exit expression should only be used in rare cases, for example in a short command-line script.
In most cases, however, using an exit expression makes the code untestable and often causes incompatibilities with other libraries. Unless you are absolutely sure it is required here, we recommend refactoring your code to avoid it.