1
|
|
|
<?php |
2
|
|
|
namespace vipnytt\RobotsTxtParser; |
3
|
|
|
|
4
|
|
|
use GuzzleHttp; |
5
|
|
|
use vipnytt\RobotsTxtParser\Client; |
6
|
|
|
use vipnytt\RobotsTxtParser\Parser\RobotsTxtInterface; |
7
|
|
|
|
8
|
|
|
/**
 * Class Download
 *
 * Fetches `/robots.txt` relative to a base URI with Guzzle and keeps the
 * resulting HTTP status code, body and character encoding, so that a parser
 * client can be built from them.
 *
 * The constants referenced through `self::` (MAX_REDIRECTS, ENCODING,
 * BYTE_LIMIT) are not declared in this class; presumably they come from
 * RobotsTxtInterface -- verify against that interface.
 *
 * @package vipnytt\RobotsTxtParser
 */
class Download implements RobotsTxtInterface
{
    /**
     * Base uri
     * @var string
     */
    protected $baseUri;

    /**
     * HTTP Status code
     * @var int
     */
    protected $statusCode;

    /**
     * Robots.txt contents
     * @var string
     */
    protected $contents;

    /**
     * Robots.txt character encoding
     * @var string
     */
    protected $encoding;

    /**
     * Download constructor.
     *
     * Performs the HTTP request immediately. On a Guzzle connect exception the
     * object is populated with the fallback values set by connectionIssue()
     * instead of letting the exception propagate.
     *
     * @param string $baseUri      Base URI the `/robots.txt` path is resolved against
     * @param array  $guzzleConfig Guzzle client options; values given here override the defaults below
     */
    public function __construct($baseUri, $guzzleConfig = [])
    {
        $this->baseUri = $baseUri;
        try {
            $client = new GuzzleHttp\Client(
                // array_replace_recursive (not array_merge_recursive): a caller-supplied
                // scalar such as 'timeout' must replace the default, not be combined
                // with it into an array of both values.
                array_replace_recursive(
                    [
                        'allow_redirects' => [
                            'max' => self::MAX_REDIRECTS,
                            'referer' => true,
                            'strict' => true,
                            'track_redirects' => true,
                        ],
                        'base_uri' => $baseUri,
                        'headers' => [
                            'Accept' => 'text/plain;q=1.0, text/*;q=0.8, */*;q=0.1',
                            'Accept-Charset' => 'utf-8;q=1.0, *;q=0.1',
                            'Accept-Encoding' => 'identity;q=1.0, *;q=0.1',
                            'User-Agent' => 'RobotsTxtParser-VIPnytt/1.0 (+https://github.com/VIPnytt/RobotsTxtParser/blob/master/README.md)',
                        ],
                        'http_errors' => false,
                        'timeout' => 60,
                        'verify' => true,
                    ],
                    $guzzleConfig
                )
            );
            $response = $client->request('GET', '/robots.txt');
            $this->statusCode = $response->getStatusCode();
            // The contents must be stored before the encoding is resolved:
            // headerEncoding() falls back to inspecting $this->contents.
            $this->contents = $response->getBody()->getContents();
            // A response without a Content-Type header yields an empty list;
            // fall back to an empty header string so detection is content based.
            $contentType = $response->getHeader('content-type');
            $this->encoding = $this->headerEncoding(isset($contentType[0]) ? $contentType[0] : '');
        } catch (GuzzleHttp\Exception\ConnectException $e) {
            $this->connectionIssue();
        }
    }

    /**
     * HTTP header encoding
     *
     * Extracts the `charset` parameter from a Content-Type header value and
     * returns it when mbstring lists it as a supported encoding. Otherwise
     * the encoding is detected from the downloaded contents.
     *
     * @param string $header Content-Type header value, may be empty
     * @return string
     */
    protected function headerEncoding($header)
    {
        $parameters = array_map('trim', mb_split(';', (string)$header));
        foreach ($parameters as $parameter) {
            if (mb_stripos($parameter, 'charset=') !== 0) {
                continue;
            }
            // The parameter value may be a quoted string (charset="utf-8");
            // strip surrounding quotes and whitespace before validating it.
            $encoding = trim(mb_split('=', $parameter, 2)[1], " \t\"'");
            $supported = array_map('mb_strtolower', mb_list_encodings());
            if (in_array(mb_strtolower($encoding), $supported, true)) {
                return $encoding;
            }
        }
        return $this->detectEncoding();
    }

    /**
     * Manually detect encoding
     *
     * Asks mbstring to detect the encoding of the downloaded contents and
     * falls back to the default encoding constant when detection fails.
     *
     * @return string
     */
    protected function detectEncoding()
    {
        $encoding = mb_detect_encoding($this->getContents());
        if ($encoding !== false) {
            return $encoding;
        }
        return self::ENCODING;
    }

    /**
     * URL content
     *
     * @return string
     */
    public function getContents()
    {
        return $this->contents;
    }

    /**
     * Connection issue
     *
     * Populates the object with fallback values when the connection could
     * not be established: status code 523, empty contents and the default
     * encoding.
     *
     * @return void
     */
    private function connectionIssue()
    {
        // NOTE(review): 523 appears to be the unofficial "origin is unreachable"
        // status code -- confirm how the parser client interprets it.
        $this->statusCode = 523;
        $this->contents = '';
        $this->encoding = self::ENCODING;
    }

    /**
     * Parser client
     *
     * Builds a parser client from the base URI and the downloaded status
     * code, contents and encoding.
     *
     * @param int|null $byteLimit Passed through to the client unchanged
     * @return Client
     */
    public function parserClient($byteLimit = self::BYTE_LIMIT)
    {
        return new Client($this->baseUri, $this->getStatusCode(), $this->getContents(), $this->getEncoding(), $byteLimit);
    }

    /**
     * Status code
     *
     * @return int
     */
    public function getStatusCode()
    {
        return $this->statusCode;
    }

    /**
     * Encoding
     *
     * @return string
     */
    public function getEncoding()
    {
        return $this->encoding;
    }
}
167
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.