1
|
|
|
<?php |
2
|
|
|
namespace vipnytt\RobotsTxtParser; |
3
|
|
|
|
4
|
|
|
use DateTime; |
5
|
|
|
use GuzzleHttp; |
6
|
|
|
use vipnytt\RobotsTxtParser\Client; |
7
|
|
|
|
8
|
|
|
/**
 * Class Request
 *
 * Performs an HTTP GET for the robots.txt path of a base URI using Guzzle,
 * records the response metadata (status code, body, character encoding,
 * Cache-Control max-age and request time) and passes the result on to the
 * parent Client constructor.
 *
 * @package vipnytt\RobotsTxtParser
 */
class Request extends Client
{
    /**
     * Default Guzzle request options.
     *
     * Options supplied to the constructor override these defaults key by key.
     * `http_errors` is disabled so that 4xx/5xx responses are returned as
     * regular responses instead of being thrown as exceptions.
     */
    const GUZZLE_HTTP_CONFIG = [
        'allow_redirects' => [
            'max' => self::MAX_REDIRECTS,
            'referer' => true,
            'strict' => true,
        ],
        'decode_content' => true,
        'headers' => [
            'Accept' => 'text/plain;q=1.0, text/*;q=0.8, */*;q=0.1',
            'Accept-Charset' => 'utf-8;q=1.0, *;q=0.1',
            'Accept-Encoding' => 'identity;q=1.0, *;q=0.1',
            'User-Agent' => 'RobotsTxtParser-VIPnytt/2.0 (+https://github.com/VIPnytt/RobotsTxtParser/blob/master/README.md)',
        ],
        'http_errors' => false,
        'timeout' => 60,
        'verify' => true,
    ];

    /**
     * Request timestamp (Unix time)
     * @var int
     */
    protected $time;

    /**
     * Cache-Control max-age, in seconds (0 when absent or on transfer failure)
     * @var int
     */
    protected $maxAge;

    /**
     * HTTP Status code (523 when the transfer itself failed)
     * @var int
     */
    protected $statusCode;

    /**
     * Robots.txt contents
     * @var string
     */
    protected $contents;

    /**
     * Robots.txt character encoding
     * @var string
     */
    protected $encoding;

    /**
     * Request constructor.
     *
     * Downloads the robots.txt for the given base URI and forwards the result
     * to the parent constructor. Transfer failures are not propagated: they are
     * recorded as status code 523 with empty contents, the default encoding and
     * a max-age of 0.
     *
     * @param string $baseUri
     * @param array $guzzleConfig Guzzle options overriding self::GUZZLE_HTTP_CONFIG
     * @param int|null $byteLimit
     */
    public function __construct($baseUri, array $guzzleConfig = [], $byteLimit = self::BYTE_LIMIT)
    {
        $baseUri = $this->urlBase($this->urlEncode($baseUri));
        try {
            $client = new GuzzleHttp\Client(
                // array_replace_recursive (not array_merge_recursive): a caller-supplied
                // option must REPLACE the default. Merging would turn e.g. 'timeout' into
                // [60, 30] and append to, rather than override, the default headers.
                array_replace_recursive(
                    self::GUZZLE_HTTP_CONFIG,
                    $guzzleConfig,
                    [
                        // Always last, so the normalized base URI cannot be overridden.
                        'base_uri' => $baseUri,
                    ]
                )
            );
            $response = $client->request('GET', self::PATH);
            $this->time = time();
            $this->statusCode = $response->getStatusCode();
            $this->contents = $response->getBody()->getContents();
            $this->encoding = $this->headerEncoding($response->getHeader('content-type'));
            $this->maxAge = $this->headerMaxAge($response->getHeader('cache-control'));
        } catch (GuzzleHttp\Exception\TransferException $e) {
            // Stamp the failed attempt too; otherwise nextUpdate()/validUntil()
            // would be computed from an unset (null) request time.
            $this->time = time();
            $this->statusCode = 523;
            $this->contents = '';
            $this->encoding = self::ENCODING;
            $this->maxAge = 0;
        }
        parent::__construct($baseUri, $this->statusCode, $this->contents, $this->encoding, $byteLimit);
    }

    /**
     * Content-Type encoding HTTP header
     *
     * Extracts the `charset` parameter from the Content-Type header values,
     * falling back to self::ENCODING when none is present.
     *
     * @param array $headers
     * @return string
     */
    protected function headerEncoding(array $headers)
    {
        if (($value = $this->parseHeader($headers, 'charset', ';')) !== false) {
            return $value;
        }
        return self::ENCODING;
    }

    /**
     * Client header
     *
     * Splits every header value on the delimiter and returns the value of the
     * first `part=value` pair whose name matches $part (case-insensitive).
     *
     * @param array $headers Header values, one string per header line
     * @param string $part Parameter/directive name to look for
     * @param string $delimiter Separator between parameters (used as an mb_split pattern)
     * @return string|false The value, or false when the part is not present
     */
    protected function parseHeader(array $headers, $part, $delimiter = ";")
    {
        foreach ($headers as $header) {
            $split = array_map('trim', mb_split($delimiter, $header));
            foreach ($split as $string) {
                if (mb_stripos($string, $part . '=') === 0) {
                    // Values may be sent as quoted-strings (charset="utf-8", max-age="60");
                    // strip surrounding whitespace and double quotes so callers get the bare value.
                    return trim(mb_split('=', $string, 2)[1], " \t\"");
                }
            }
        }
        return false;
    }

    /**
     * Cache-Control max-age HTTP header
     *
     * Extracts the `max-age` directive from the Cache-Control header values,
     * returning 0 when it is absent.
     *
     * @param array $headers
     * @return int
     */
    protected function headerMaxAge(array $headers)
    {
        if (($value = $this->parseHeader($headers, 'max-age', ',')) !== false) {
            return intval($value);
        }
        return 0;
    }

    /**
     * Base URI
     *
     * Returns $this->base, which is not assigned in this class
     * (presumably set by the parent constructor - confirm against Client).
     *
     * @return string
     */
    public function getBaseUri()
    {
        return $this->base;
    }

    /**
     * Status code
     *
     * @return int
     */
    public function getStatusCode()
    {
        return $this->statusCode;
    }

    /**
     * URL content
     *
     * @return string
     */
    public function getContents()
    {
        return $this->contents;
    }

    /**
     * Encoding
     *
     * @return string
     */
    public function getEncoding()
    {
        return $this->encoding;
    }

    /**
     * Next update timestamp
     *
     * Request time plus self::CACHE_TIME.
     *
     * @return int
     */
    public function nextUpdate()
    {
        return $this->time + self::CACHE_TIME;
    }

    /**
     * Valid until timestamp
     *
     * Request time plus the larger of self::CACHE_TIME and the Cache-Control
     * max-age reported by the server.
     *
     * @return int
     */
    public function validUntil()
    {
        return $this->time + max(self::CACHE_TIME, $this->maxAge);
    }
}
207
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.