1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Spatie\Crawler\CrawlQueue; |
4
|
|
|
|
5
|
|
|
use Predis\Client; |
6
|
|
|
use Psr\Http\Message\UriInterface; |
7
|
|
|
use Spatie\Crawler\CrawlUrl; |
8
|
|
|
use Spatie\Crawler\Exception\InvalidUrl; |
9
|
|
|
use Spatie\Crawler\Exception\UrlNotFoundByIndex; |
10
|
|
|
|
11
|
|
|
/**
 * CrawlQueue implementation backed by two Redis hashes.
 *
 * Both hashes are keyed by the URL string and store the serialized CrawlUrl:
 * self::URLS receives every URL passed to add(), while self::PENDING_URLS
 * only holds the URLs that have not been passed to markAsProcessed() yet.
 *
 * NOTE(review): stored values are restored with unserialize(), which
 * instantiates whatever classes the payload names. This is only safe while
 * the Redis instance can be written to by trusted processes exclusively.
 */
class RedisCrawlQueue implements CrawlQueue
{
    /** Name of the hash with all known URLs, indexed by URL string. */
    public const URLS = 'urls';

    /** Name of the hash with the pending URLs, indexed by URL string. */
    public const PENDING_URLS = 'pending';

    /**
     * Redis connection used for every queue operation.
     *
     * @var \Predis\Client
     */
    private $redis;

    /**
     * @param \Predis\Client|null $redis Client to use; when omitted a Client
     *                                   is created without any arguments.
     */
    public function __construct(?Client $redis = null)
    {
        $this->redis = $redis ?? new Client();
    }

    /**
     * Registers a URL as known and pending, unless it is already known.
     *
     * The URL string is used as the id of the CrawlUrl, so the value returned
     * by the CrawlUrl's id can be handed back to getUrlById().
     *
     * @param CrawlUrl $url URL to enqueue; its id is set when it is new.
     *
     * @return CrawlQueue This queue, to allow chaining.
     */
    public function add(CrawlUrl $url) : CrawlQueue
    {
        $urlString = (string) $url->url;

        if ($this->has($urlString)) {
            return $this;
        }

        $url->setId($urlString);

        // Serialize once, after the id has been set, so both hashes hold the
        // exact same payload.
        $serialized = serialize($url);
        $this->redis->hset(self::URLS, $urlString, $serialized);
        $this->redis->hset(self::PENDING_URLS, $urlString, $serialized);

        return $this;
    }

    /**
     * Tells whether a URL has ever been added to this queue.
     *
     * @param CrawlUrl|UriInterface|string $crawlUrl URL to look up.
     *
     * @return bool True when the URL is a field of the self::URLS hash.
     *
     * @throws InvalidUrl When $crawlUrl is of an unsupported type.
     */
    public function has($crawlUrl) : bool
    {
        return (bool) $this->redis->hexists(self::URLS, $this->toUrlString($crawlUrl));
    }

    /**
     * Tells whether at least one URL is still waiting to be processed.
     *
     * @return bool True when the self::PENDING_URLS hash is not empty.
     */
    public function hasPendingUrls() : bool
    {
        return (bool) $this->redis->hlen(self::PENDING_URLS);
    }

    /**
     * Fetches a known URL by its id, which is the URL string (see add()).
     *
     * @param CrawlUrl|UriInterface|string $id Id of the URL to fetch.
     *
     * @return CrawlUrl The URL as it was stored by add().
     *
     * @throws InvalidUrl         When $id is of an unsupported type.
     * @throws UrlNotFoundByIndex When no URL is stored under that id.
     */
    public function getUrlById($id) : CrawlUrl
    {
        $key = $this->toUrlString($id);

        // A single HGET both checks for existence and fetches the value: a
        // missing field yields null, so no separate HEXISTS call is needed.
        $serialized = $this->redis->hget(self::URLS, $key);

        if ($serialized === null) {
            throw new UrlNotFoundByIndex("Crawl url {$key} not found in hashes.");
        }

        return unserialize($serialized);
    }

    /**
     * Returns one URL that is still pending, or null when there is none.
     *
     * "First" follows the order in which Redis reports the hash fields.
     *
     * @return CrawlUrl|null A pending URL, or null when the hash is empty.
     */
    public function getFirstPendingUrl() : ?CrawlUrl
    {
        foreach ($this->redis->hkeys(self::PENDING_URLS) as $key) {
            $serialized = $this->redis->hget(self::PENDING_URLS, $key);

            // The field was removed after the keys were listed, so it is no
            // longer pending; unserializing null would yield false and
            // violate the return type. Try the next key instead.
            if ($serialized === null) {
                continue;
            }

            return unserialize($serialized);
        }

        return null;
    }

    /**
     * Tells whether a URL is known but no longer pending.
     *
     * @param CrawlUrl $url URL to check.
     *
     * @return bool False for pending or unknown URLs, true otherwise.
     */
    public function hasAlreadyBeenProcessed(CrawlUrl $url) : bool
    {
        $urlString = (string) $url->url;

        if ($this->redis->hexists(self::PENDING_URLS, $urlString)) {
            return false;
        }

        return (bool) $this->redis->hexists(self::URLS, $urlString);
    }

    /**
     * Removes a URL from the pending hash; it stays in the known-URLs hash.
     *
     * @param CrawlUrl $crawlUrl URL that has been handled.
     */
    public function markAsProcessed(CrawlUrl $crawlUrl)
    {
        $this->redis->hdel(self::PENDING_URLS, (string) $crawlUrl->url);
    }

    /**
     * Converts any supported URL representation into the hash field name.
     *
     * @param CrawlUrl|UriInterface|string $crawlUrl Value to convert.
     *
     * @return string The URL string used as field name in both hashes.
     *
     * @throws InvalidUrl When $crawlUrl is of an unsupported type.
     */
    private function toUrlString($crawlUrl) : string
    {
        if ($crawlUrl instanceof CrawlUrl) {
            return (string) $crawlUrl->url;
        }

        if ($crawlUrl instanceof UriInterface) {
            return (string) $crawlUrl;
        }

        if (is_string($crawlUrl)) {
            return $crawlUrl;
        }

        throw InvalidUrl::unexpectedType($crawlUrl);
    }
}
108
|
|
|
|