1
|
|
|
<?php |
2
|
|
|
namespace vipnytt\RobotsTxtParser; |
3
|
|
|
|
4
|
|
|
use PDO; |
5
|
|
|
use vipnytt\RobotsTxtParser\Exceptions\SQLException; |
6
|
|
|
use vipnytt\RobotsTxtParser\Parser\UrlParser; |
7
|
|
|
use vipnytt\RobotsTxtParser\SQL\SQLInterface; |
8
|
|
|
|
9
|
|
|
/** |
10
|
|
|
* Class Cache |
11
|
|
|
* |
12
|
|
|
* @package vipnytt\RobotsTxtParser |
13
|
|
|
*/ |
14
|
|
|
class Cache implements RobotsTxtInterface, SQLInterface |
15
|
|
|
{ |
16
|
|
|
use UrlParser; |
17
|
|
|
|
18
|
|
|
/** |
19
|
|
|
* Supported database drivers |
20
|
|
|
*/ |
21
|
|
|
const SUPPORTED_DRIVERS = [ |
22
|
|
|
self::DRIVER_MYSQL, |
23
|
|
|
]; |
24
|
|
|
|
25
|
|
|
/** |
26
|
|
|
* Database connection |
27
|
|
|
* @var PDO |
28
|
|
|
*/ |
29
|
|
|
private $pdo; |
30
|
|
|
|
31
|
|
|
/** |
32
|
|
|
* GuzzleHTTP config |
33
|
|
|
* @var array |
34
|
|
|
*/ |
35
|
|
|
private $guzzleConfig = []; |
36
|
|
|
|
37
|
|
|
/** |
38
|
|
|
* Byte limit |
39
|
|
|
* @var int|null |
40
|
|
|
*/ |
41
|
|
|
private $byteLimit = self::BYTE_LIMIT; |
42
|
|
|
|
43
|
|
|
/** |
44
|
|
|
* Client nextUpdate margin in seconds |
45
|
|
|
* @var int |
46
|
|
|
*/ |
47
|
|
|
private $clientUpdateMargin = 300; |
48
|
|
|
|
49
|
|
|
/** |
50
|
|
|
* PDO driver |
51
|
|
|
* @var string |
52
|
|
|
*/ |
53
|
|
|
private $driver; |
54
|
|
|
|
55
|
|
|
/**
 * Cache constructor.
 *
 * Remembers the request configuration and byte limit, then stores the PDO
 * handle after it has been prepared by pdoInitialize().
 *
 * @param PDO $pdo Database connection
 * @param array $guzzleConfig GuzzleHTTP config
 * @param int|null $byteLimit Byte limit
 * @throws SQLException Propagated from pdoInitialize()
 */
public function __construct(PDO $pdo, array $guzzleConfig = [], $byteLimit = self::BYTE_LIMIT)
{
    $this->byteLimit = $byteLimit;
    $this->guzzleConfig = $guzzleConfig;
    $connection = $this->pdoInitialize($pdo);
    $this->pdo = $connection;
}
68
|
|
|
|
69
|
|
|
/**
 * Initialize PDO connection
 *
 * Normalizes the connection attributes, verifies that the driver is
 * supported, sets the connection encoding and makes sure the cache table
 * exists (trying to create it from SQL/cache.sql when it does not).
 *
 * @param PDO $pdo
 * @return PDO The same connection, ready for use
 * @throws SQLException When the driver is unsupported or the table is missing and cannot be created
 */
private function pdoInitialize(PDO $pdo)
{
    if ($pdo->getAttribute(PDO::ATTR_ERRMODE) === PDO::ERRMODE_SILENT) {
        $pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);
    }
    $pdo->setAttribute(PDO::ATTR_CASE, PDO::CASE_NATURAL);
    $pdo->setAttribute(PDO::ATTR_ORACLE_NULLS, PDO::NULL_NATURAL);
    // Check the driver before sending any driver specific SQL to the server.
    $this->driver = $pdo->getAttribute(PDO::ATTR_DRIVER_NAME);
    if (!in_array($this->driver, self::SUPPORTED_DRIVERS, true)) {
        throw new SQLException('Unsupported database. ' . self::README_SQL_CACHE);
    }
    $pdo->exec('SET NAMES ' . self::SQL_ENCODING);
    // In warning mode PDO::query() returns false instead of throwing, so both
    // failure styles have to be treated as "table not available".
    try {
        $probe = $pdo->query("SELECT 1 FROM robotstxt__cache0 LIMIT 1;");
        $tableExists = $probe !== false;
        if ($tableExists) {
            // Release the result set so later statements are not blocked.
            $probe->closeCursor();
        }
    } catch (\Exception $exception1) {
        $tableExists = false;
    }
    if (!$tableExists) {
        try {
            $schema = file_get_contents(__DIR__ . '/SQL/cache.sql');
            $created = $schema !== false && $pdo->query($schema) !== false;
        } catch (\Exception $exception2) {
            $created = false;
        }
        if (!$created) {
            throw new SQLException('Missing table `' . self::TABLE_CACHE . '`. Setup instructions: ' . self::README_SQL_CACHE);
        }
    }
    return $pdo;
}
99
|
|
|
|
100
|
|
|
/**
 * Parser client
 *
 * Looks up the cached robots.txt for the base of the given URI. The cached
 * copy is used while its `nextUpdate` is newer than the SQL server time minus
 * the client update margin. Otherwise a new UriClient is created, pushed to
 * the database and its result is returned.
 *
 * @param string $baseUri
 * @return TxtClient
 * @throws SQLException When the PHP and SQL server timestamps are out of sync
 */
public function client($baseUri)
{
    $base = $this->urlBase($this->urlEncode($baseUri));
    $query = $this->pdo->prepare(<<<SQL
SELECT
  content,
  statusCode,
  nextUpdate,
  worker,
  UNIX_TIMESTAMP()
FROM robotstxt__cache0
WHERE base = :base;
SQL
    );
    $query->bindParam(':base', $base, PDO::PARAM_STR);
    $query->execute();
    // PDOStatement::rowCount() is not guaranteed to report the size of a
    // SELECT result, so the row itself is fetched and inspected instead.
    $row = $query->fetch(PDO::FETCH_ASSOC);
    // Free the result set before any further statement is issued.
    $query->closeCursor();
    if (is_array($row)) {
        $this->clockSyncCheck($row['UNIX_TIMESTAMP()']);
        if ($row['nextUpdate'] > ($row['UNIX_TIMESTAMP()'] - $this->clientUpdateMargin)) {
            $this->markAsActive($base, $row['worker']);
            return new TxtClient($base, $row['statusCode'], $row['content'], self::ENCODING, $this->byteLimit);
        }
    }
    // No usable cache entry: request, store and serve a fresh copy.
    $request = new UriClient($base, $this->guzzleConfig, $this->byteLimit);
    $this->push($request);
    $this->markAsActive($base);
    return new TxtClient($base, $request->getStatusCode(), $request->getContents(), self::ENCODING, $this->byteLimit);
}
135
|
|
|
|
136
|
|
|
/**
 * Clock sync check
 *
 * Compares the local PHP time with a timestamp reported by the SQL server and
 * refuses to continue when they differ by more than 10 seconds.
 *
 * @param int $time Timestamp to compare against time()
 * @throws SQLException When the difference exceeds 10 seconds
 */
private function clockSyncCheck($time)
{
    $drift = abs(time() - $time);
    if ($drift > 10) {
        throw new SQLException('`PHP server` and `SQL server` timestamps are out of sync. Please fix!');
    }
    return;
}
148
|
|
|
|
149
|
|
|
/**
 * Mark robots.txt as active
 *
 * When the given worker value is (loosely) zero, the row for `$base` has its
 * worker column switched from 0 to NULL. Rows with a NULL worker are the ones
 * cron() selects for updating. Any other worker value is left untouched.
 *
 * @param string $base
 * @param int|null $workerID
 * @return bool True when nothing had to be done, otherwise the result of the UPDATE
 */
private function markAsActive($base, $workerID = 0)
{
    // Loose comparison on purpose: NULL and "0" coming from the database count as zero.
    if ($workerID != 0) {
        return true;
    }
    $statement = $this->pdo->prepare(<<<SQL
UPDATE robotstxt__cache0
SET worker = NULL
WHERE base = :base AND worker = 0;
SQL
    );
    $statement->bindParam(':base', $base, PDO::PARAM_STR);
    return $statement->execute();
}
170
|
|
|
|
171
|
|
|
/**
 * Update a robots.txt in the database
 *
 * For a 5xx response on an http(s) base, displacePush() is tried first and,
 * if it succeeds, the stored record is kept as it is. In every other case
 * the rendered content is inserted, or the existing row for the base is
 * overwritten and its worker reset to 0.
 *
 * @param UriClient $client
 * @return bool
 * @throws SQLException Propagated from displacePush() when the clocks are out of sync
 */
private function push(UriClient $client)
{
    $base = $client->getBaseUri();
    $statusCode = $client->getStatusCode();
    $nextUpdate = $client->nextUpdate();
    if (
        $statusCode >= 500 &&
        $statusCode < 600 &&
        mb_stripos($base, 'http') === 0 &&
        $this->displacePush($base, $nextUpdate)
    ) {
        return true;
    }
    $validUntil = $client->validUntil();
    $content = $client->render();
    // Each named placeholder is used exactly once; repeating a placeholder is
    // only accepted while PDO emulates prepares. VALUES(col) refers to the
    // value supplied for that column in the INSERT part.
    $query = $this->pdo->prepare(<<<SQL
INSERT INTO robotstxt__cache0 (base, content, statusCode, validUntil, nextUpdate)
VALUES (:base, :content, :statusCode, :validUntil, :nextUpdate)
ON DUPLICATE KEY UPDATE content = VALUES(content), statusCode = VALUES(statusCode), validUntil = VALUES(validUntil),
  nextUpdate = VALUES(nextUpdate), worker = 0;
SQL
    );
    $query->bindParam(':base', $base, PDO::PARAM_STR);
    $query->bindParam(':content', $content, PDO::PARAM_STR);
    $query->bindParam(':statusCode', $statusCode, PDO::PARAM_INT);
    $query->bindParam(':validUntil', $validUntil, PDO::PARAM_INT);
    $query->bindParam(':nextUpdate', $nextUpdate, PDO::PARAM_INT);
    return $query->execute();
}
206
|
|
|
|
207
|
|
|
/**
 * Displace push timestamp
 *
 * If a record for the base exists and its `validUntil` is still in the
 * future (according to the SQL server), only the `nextUpdate` timestamp is
 * moved (never past `validUntil`) and the worker is reset to NULL, leaving
 * the stored content untouched.
 *
 * @param string $base
 * @param int $nextUpdate
 * @return bool False when there is no still-valid record, otherwise the result of the UPDATE
 * @throws SQLException When the PHP and SQL server timestamps are out of sync
 */
private function displacePush($base, $nextUpdate)
{
    $query = $this->pdo->prepare(<<<SQL
SELECT
  validUntil,
  UNIX_TIMESTAMP()
FROM robotstxt__cache0
WHERE base = :base;
SQL
    );
    $query->bindParam(':base', $base, PDO::PARAM_STR);
    $query->execute();
    // PDOStatement::rowCount() is not guaranteed to report the size of a
    // SELECT result, so the row itself is fetched and inspected instead.
    $row = $query->fetch(PDO::FETCH_ASSOC);
    // Free the result set before the UPDATE below is issued.
    $query->closeCursor();
    if (is_array($row)) {
        $this->clockSyncCheck($row['UNIX_TIMESTAMP()']);
        if ($row['validUntil'] > $row['UNIX_TIMESTAMP()']) {
            // Database values may arrive as strings; bind a real integer.
            $nextUpdate = min((int)$row['validUntil'], $nextUpdate);
            $query = $this->pdo->prepare(<<<SQL
UPDATE robotstxt__cache0
SET nextUpdate = :nextUpdate, worker = NULL
WHERE base = :base;
SQL
            );
            $query->bindParam(':base', $base, PDO::PARAM_STR);
            $query->bindParam(':nextUpdate', $nextUpdate, PDO::PARAM_INT);
            return $query->execute();
        }
    }
    return false;
}
244
|
|
|
|
245
|
|
|
/**
 * Process the update queue
 *
 * Repeatedly claims the most overdue unassigned record (worker IS NULL and
 * nextUpdate not in the future) for this worker, then refreshes every record
 * currently assigned to the worker. Stops when nothing is left to claim or
 * when the last refresh of a round failed.
 *
 * @param int|null $workerID
 * @return bool True when the queue is empty, false when a push failed
 * @throws SQLException Propagated from push()
 */
public function cron($workerID = null)
{
    $worker = $this->setWorkerID($workerID);
    $result = true;
    while ($result) {
        // Claim and select are separate statements: a single prepare() can
        // neither carry two statements with native prepares nor expose the
        // SELECT rows behind the UPDATE when prepares are emulated.
        $claim = $this->pdo->prepare(<<<SQL
UPDATE robotstxt__cache0
SET worker = :workerID
WHERE worker IS NULL AND nextUpdate <= UNIX_TIMESTAMP()
ORDER BY nextUpdate ASC
LIMIT 1;
SQL
        );
        $claim->bindParam(':workerID', $worker, PDO::PARAM_INT);
        $claim->execute();
        if ($claim->rowCount() <= 0) {
            // Nothing left to claim, the queue is done.
            return true;
        }
        $select = $this->pdo->prepare(<<<SQL
SELECT base
FROM robotstxt__cache0
WHERE worker = :workerID;
SQL
        );
        $select->bindParam(':workerID', $worker, PDO::PARAM_INT);
        $select->execute();
        // Read everything first so push() can run its own statements freely.
        $rows = $select->fetchAll(PDO::FETCH_ASSOC) ?: [];
        foreach ($rows as $row) {
            $result = $this->push(new UriClient($row['base'], $this->guzzleConfig, $this->byteLimit));
        }
    }
    return false;
}
279
|
|
|
|
280
|
|
|
/**
 * Set WorkerID
 *
 * Accepts an integer between 1 and 255 as is. For null a random ID in that
 * range is returned. Any other value raises an E_USER_WARNING and a random ID
 * is returned as well.
 *
 * @param int|null $workerID
 * @return int
 */
protected function setWorkerID($workerID = null)
{
    if ($workerID === null) {
        return rand(1, 255);
    }
    $isValid = is_int($workerID) && $workerID <= 255 && $workerID >= 1;
    if ($isValid) {
        return $workerID;
    }
    trigger_error('WorkerID out of range (1-255)', E_USER_WARNING);
    return rand(1, 255);
}
299
|
|
|
|
300
|
|
|
/**
 * Clean the cache table
 *
 * Deletes records with worker 0 whose `nextUpdate` lies further in the past
 * than CACHE_TIME plus the given delay, measured against the SQL server time.
 *
 * @param int $delay - in seconds
 * @return bool Result of the DELETE statement
 */
public function clean($delay = 600)
{
    $threshold = self::CACHE_TIME + $delay;
    $statement = $this->pdo->prepare(<<<SQL
DELETE FROM robotstxt__cache0
WHERE worker = 0 AND nextUpdate < (UNIX_TIMESTAMP() - :delay);
SQL
    );
    $statement->bindParam(':delay', $threshold, PDO::PARAM_INT);
    return $statement->execute();
}
317
|
|
|
|
318
|
|
|
/**
 * Invalidate cache
 *
 * Removes the stored record for the base of the given URI.
 *
 * @param string $baseUri
 * @return bool Result of the DELETE statement
 */
public function invalidate($baseUri)
{
    $encoded = $this->urlEncode($baseUri);
    $base = $this->urlBase($encoded);
    $statement = $this->pdo->prepare(<<<SQL
DELETE FROM robotstxt__cache0
WHERE base = :base;
SQL
    );
    $statement->bindParam(':base', $base, PDO::PARAM_STR);
    return $statement->execute();
}
335
|
|
|
} |
336
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.