1
|
|
|
<?php |
2
|
|
|
namespace vipnytt\RobotsTxtParser; |
3
|
|
|
|
4
|
|
|
use PDO; |
5
|
|
|
use vipnytt\RobotsTxtParser\Exceptions\ClientException; |
6
|
|
|
use vipnytt\RobotsTxtParser\Exceptions\SQLException; |
7
|
|
|
use vipnytt\RobotsTxtParser\Parser\UriParser; |
8
|
|
|
|
9
|
|
|
/**
 * Class Cache
 *
 * SQL-backed cache of robots.txt files, stored in the `robotstxt__cache1` table.
 *
 * The `worker` column is used as a small state flag by the queries in this class:
 *  - `0`     written by push(); rows in this state are the only ones removed by clean()
 *  - `NULL`  set by markAsActive() and displacePush(); rows in this state are picked up by cron()
 *  - `1-255` set by cron() while a worker is refreshing the row
 *
 * @see https://github.com/VIPnytt/RobotsTxtParser/blob/master/docs/methods/Cache.md for documentation
 * @package vipnytt\RobotsTxtParser
 */
class Cache implements RobotsTxtInterface, SQLInterface
{
    use UriParser;

    /**
     * Supported database drivers
     */
    const SUPPORTED_DRIVERS = [
        self::DRIVER_MYSQL,
    ];

    /**
     * Client nextUpdate margin in seconds.
     * A cached row is still served by client() until `nextUpdate` lies more than this many seconds in the past.
     * @var int
     */
    protected $clientUpdateMargin = 300;

    /**
     * Database handler
     * @var PDO
     */
    private $pdo;

    /**
     * cURL options, handed to every UriClient created by this class
     * @var array
     */
    private $curlOptions = [];

    /**
     * Byte limit, handed to every UriClient and TxtClient created by this class
     * @var int|null
     */
    private $byteLimit = self::BYTE_LIMIT;

    /**
     * PDO driver name, as reported by the connection
     * @var string
     */
    private $driver;

    /**
     * Cache constructor.
     *
     * @param PDO $pdo
     * @param array $curlOptions
     * @param int|null $byteLimit
     * @throws SQLException
     */
    public function __construct(PDO $pdo, array $curlOptions = [], $byteLimit = self::BYTE_LIMIT)
    {
        $this->pdo = $this->pdoInitialize($pdo);
        $this->curlOptions = $curlOptions;
        $this->byteLimit = $byteLimit;
    }

    /**
     * Initialize PDO connection
     *
     * Validates the driver, normalizes the connection attributes and makes sure the cache table exists,
     * trying to create it from `res/cache.sql` when it does not.
     *
     * @param PDO $pdo
     * @return PDO
     * @throws SQLException
     */
    private function pdoInitialize(PDO $pdo)
    {
        // Validate the driver first, so an unsupported database gets the intended error message
        // instead of a driver error from the MySQL specific `SET NAMES` statement below.
        $this->driver = $pdo->getAttribute(PDO::ATTR_DRIVER_NAME);
        if (!in_array($this->driver, self::SUPPORTED_DRIVERS, true)) {
            throw new SQLException('Unsupported database. ' . self::README_SQL_CACHE);
        }
        if ($pdo->getAttribute(PDO::ATTR_ERRMODE) === PDO::ERRMODE_SILENT) {
            $pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_WARNING);
        }
        $pdo->setAttribute(PDO::ATTR_CASE, PDO::CASE_NATURAL);
        $pdo->setAttribute(PDO::ATTR_ORACLE_NULLS, PDO::NULL_NATURAL);
        $pdo->exec('SET NAMES ' . self::SQL_ENCODING);
        // With ERRMODE_WARNING a failing query returns false instead of throwing,
        // so both the return value and exceptions have to be treated as "failed".
        try {
            $tableExists = $pdo->query("SELECT 1 FROM robotstxt__cache1 LIMIT 1;") !== false;
        } catch (\Exception $exception) {
            $tableExists = false;
        }
        if (!$tableExists) {
            try {
                $schema = file_get_contents(__DIR__ . '/../res/cache.sql');
                $tableCreated = is_string($schema) && $schema !== '' && $pdo->query($schema) !== false;
            } catch (\Exception $exception) {
                $tableCreated = false;
            }
            if (!$tableCreated) {
                throw new SQLException('Missing table `' . self::TABLE_CACHE . '`. Setup instructions: ' . self::README_SQL_CACHE);
            }
        }
        return $pdo;
    }

    /**
     * Parser client
     *
     * Serves the robots.txt from the cache when a sufficiently fresh row exists,
     * otherwise downloads it, stores it and serves the downloaded copy.
     *
     * @param string $baseUri
     * @return TxtClient
     * @throws SQLException
     */
    public function client($baseUri)
    {
        $base = $this->urlBase($baseUri);
        $query = $this->pdo->prepare(<<<SQL
SELECT
  content,
  statusCode,
  nextUpdate,
  effective,
  worker,
  UNIX_TIMESTAMP()
FROM robotstxt__cache1
WHERE base = :base;
SQL
        );
        $query->bindParam(':base', $base, PDO::PARAM_STR);
        $query->execute();
        // Test the fetched row itself; rowCount() is not reliable for SELECT statements
        $row = $query->fetch(PDO::FETCH_ASSOC);
        $query->closeCursor();
        if (is_array($row)) {
            $this->clockSyncCheck($row['UNIX_TIMESTAMP()']);
            if ($row['nextUpdate'] > ($row['UNIX_TIMESTAMP()'] - $this->clientUpdateMargin)) {
                $this->markAsActive($base, $row['worker']);
                return new TxtClient($base, $row['statusCode'], $row['content'], self::ENCODING, $row['effective'], $this->byteLimit);
            }
        }
        // No row, or the row is too old: fetch a fresh copy and cache it
        $request = new UriClient($base, $this->curlOptions, $this->byteLimit);
        $this->push($request);
        $this->markAsActive($base);
        return new TxtClient($base, $request->getStatusCode(), $request->getContents(), $request->getEncoding(), $request->getEffectiveUri(), $this->byteLimit);
    }

    /**
     * Clock sync check
     *
     * Compares the SQL server timestamp to the PHP server clock, and refuses to continue
     * if they differ by 10 seconds or more.
     *
     * @param int $time - SQL server UNIX timestamp
     * @throws SQLException
     */
    private function clockSyncCheck($time)
    {
        if (abs(time() - $time) >= 10) {
            throw new SQLException('`PHP server` and `SQL server` timestamps are out of sync. Please fix!');
        }
    }

    /**
     * Mark robots.txt as active
     *
     * Switches the row from `worker = 0` to `worker = NULL`, which is the state cron() looks for.
     * Rows currently held by a worker (non-zero ID) are left untouched.
     *
     * @param string $base
     * @param int|null $workerID - current value of the `worker` column
     * @return bool
     */
    private function markAsActive($base, $workerID = 0)
    {
        // The column value may arrive as an integer, a numeric string or NULL
        if ((int)$workerID !== 0) {
            return true;
        }
        $query = $this->pdo->prepare(<<<SQL
UPDATE robotstxt__cache1
SET worker = NULL
WHERE base = :base AND worker = 0;
SQL
        );
        $query->bindParam(':base', $base, PDO::PARAM_STR);
        return $query->execute();
    }

    /**
     * Update an robots.txt in the database
     *
     * When the download failed (no status code, or a 5xx response) and a still valid copy exists,
     * the existing copy is kept and only its next update time is displaced.
     * Otherwise the row is inserted or overwritten, and `worker` is reset to 0.
     *
     * @param UriClient $client
     * @return bool
     * @throws SQLException
     */
    private function push(UriClient $client)
    {
        $base = $client->getBaseUri();
        $statusCode = $client->getStatusCode();
        $nextUpdate = $client->nextUpdate();
        // The effective URI is only stored when it differs from the base
        $effective = $client->getEffectiveUri();
        if ($effective === $base) {
            $effective = null;
        }
        $isServerFailure = $statusCode === null || ($statusCode >= 500 && $statusCode < 600);
        if (
            stripos($base, 'http') === 0 &&
            $isServerFailure &&
            $this->displacePush($base, $nextUpdate)
        ) {
            return true;
        }
        $validUntil = $client->validUntil();
        $content = $client->render();
        // VALUES(col) refers to the value from the INSERT part. Every named placeholder is thereby
        // used only once, which is required when PDO is not emulating prepared statements.
        $query = $this->pdo->prepare(<<<SQL
INSERT INTO robotstxt__cache1 (base, content, statusCode, validUntil, nextUpdate, effective)
VALUES (:base, :content, :statusCode, :validUntil, :nextUpdate, :effective)
ON DUPLICATE KEY UPDATE content = VALUES(content), statusCode = VALUES(statusCode), validUntil = VALUES(validUntil),
  nextUpdate = VALUES(nextUpdate), effective = VALUES(effective), worker = 0;
SQL
        );
        $query->bindParam(':base', $base, PDO::PARAM_STR);
        $query->bindParam(':content', $content, PDO::PARAM_STR);
        $query->bindParam(':statusCode', $statusCode, PDO::PARAM_INT | PDO::PARAM_NULL);
        $query->bindParam(':validUntil', $validUntil, PDO::PARAM_INT);
        $query->bindParam(':nextUpdate', $nextUpdate, PDO::PARAM_INT);
        $query->bindParam(':effective', $effective, PDO::PARAM_STR | PDO::PARAM_NULL);
        return $query->execute();
    }

    /**
     * Displace push timestamp
     *
     * If the cached copy is still valid, it is kept: only `nextUpdate` is moved (never beyond
     * `validUntil`) and the row is handed back to the update queue. If the cached copy has expired,
     * it is deleted instead.
     *
     * @param string $base
     * @param int $nextUpdate
     * @return bool - true if the existing copy was kept, false if the caller should store the new result
     * @throws SQLException
     */
    private function displacePush($base, $nextUpdate)
    {
        $query = $this->pdo->prepare(<<<SQL
SELECT
  validUntil,
  UNIX_TIMESTAMP()
FROM robotstxt__cache1
WHERE base = :base;
SQL
        );
        $query->bindParam(':base', $base, PDO::PARAM_STR);
        $query->execute();
        // Test the fetched row itself; rowCount() is not reliable for SELECT statements
        $row = $query->fetch(PDO::FETCH_ASSOC);
        $query->closeCursor();
        if (!is_array($row)) {
            return false;
        }
        $this->clockSyncCheck($row['UNIX_TIMESTAMP()']);
        if ($row['validUntil'] > $row['UNIX_TIMESTAMP()']) {
            $nextUpdate = min((int)$row['validUntil'], $nextUpdate);
            $query = $this->pdo->prepare(<<<SQL
UPDATE robotstxt__cache1
SET nextUpdate = :nextUpdate, worker = NULL
WHERE base = :base;
SQL
            );
            $query->bindParam(':base', $base, PDO::PARAM_STR);
            $query->bindParam(':nextUpdate', $nextUpdate, PDO::PARAM_INT);
            return $query->execute();
        }
        $this->invalidate($base);
        return false;
    }

    /**
     * Invalidate cache
     *
     * Deletes the cached robots.txt for the given base URI.
     *
     * @param string $baseUri
     * @return bool
     */
    public function invalidate($baseUri)
    {
        $base = $this->urlBase($baseUri);
        $query = $this->pdo->prepare(<<<SQL
DELETE FROM robotstxt__cache1
WHERE base = :base;
SQL
        );
        $query->bindParam(':base', $base, PDO::PARAM_STR);
        return $query->execute();
    }

    /**
     * Process the update queue
     *
     * Repeatedly claims the most overdue row of the queue for this worker and refreshes every row
     * held by the worker, until the queue is empty or the target time has passed.
     *
     * @param float|int $targetTime - time budget in seconds
     * @param int|null $workerID
     * @return string[] - base URIs updated during this run
     * @throws ClientException
     * @throws SQLException
     */
    public function cron($targetTime = 60, $workerID = null)
    {
        $start = microtime(true);
        $worker = $this->setWorkerID($workerID);
        $log = [];
        // Claiming and listing are separate statements: packed into one multi-statement query,
        // rowCount() and fetch() would both refer to the UPDATE, and the bases would never be read.
        $claim = $this->pdo->prepare(<<<SQL
UPDATE robotstxt__cache1
SET worker = :workerID
WHERE worker IS NULL AND nextUpdate <= UNIX_TIMESTAMP()
ORDER BY nextUpdate ASC
LIMIT 1;
SQL
        );
        $claim->bindParam(':workerID', $worker, PDO::PARAM_INT);
        $select = $this->pdo->prepare(<<<SQL
SELECT base
FROM robotstxt__cache1
WHERE worker = :workerID
LIMIT 100;
SQL
        );
        $select->bindParam(':workerID', $worker, PDO::PARAM_INT);
        while ($targetTime > microtime(true) - $start) {
            $claim->execute();
            if ($claim->rowCount() < 1) {
                // Nothing left to claim
                break;
            }
            $select->execute();
            // Buffer the list before pushing, since push() runs its own queries on the same connection
            $bases = $select->fetchAll(PDO::FETCH_COLUMN);
            foreach ($bases as $base) {
                if ($targetTime <= microtime(true) - $start) {
                    break;
                }
                if (!$this->push(new UriClient($base, $this->curlOptions, $this->byteLimit))) {
                    throw new ClientException('Unable to update `' . $base . '`');
                }
                $log[] = $base;
            }
        }
        return $log;
    }

    /**
     * Set WorkerID
     *
     * Returns the given ID if it is an integer within 1-255. A random ID within the same range is
     * returned otherwise; a warning is additionally triggered if an invalid (non-null) ID was given.
     *
     * @param int|null $workerID
     * @return int
     */
    protected function setWorkerID($workerID = null)
    {
        if (
            is_int($workerID) &&
            $workerID >= 1 &&
            $workerID <= 255
        ) {
            return $workerID;
        }
        if ($workerID !== null) {
            trigger_error('WorkerID out of range (1-255)', E_USER_WARNING);
        }
        return mt_rand(1, 255);
    }

    /**
     * Clean the cache table
     *
     * Deletes rows with `worker = 0` whose `nextUpdate` lies more than
     * `CACHE_TIME + $delay` seconds in the past.
     *
     * @param int $delay - in seconds
     * @return bool
     */
    public function clean($delay = 600)
    {
        $delay = self::CACHE_TIME + $delay;
        $query = $this->pdo->prepare(<<<SQL
DELETE FROM robotstxt__cache1
WHERE worker = 0 AND nextUpdate < (UNIX_TIMESTAMP() - :delay);
SQL
        );
        $query->bindParam(':delay', $delay, PDO::PARAM_INT);
        return $query->execute();
    }
}
358
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.