Completed
Pull Request — master (#260)
by
unknown
04:46
created

RedisCrawlQueue   A

Complexity

Total Complexity 17

Size/Duplication

Total Lines 94
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 4

Importance

Changes 0
Metric Value
wmc 17
lcom 1
cbo 4
dl 0
loc 94
rs 10
c 0
b 0
f 0

8 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 7 2
A add() 0 13 2
A has() 0 14 4
A hasPendingUrls() 0 4 1
A getUrlById() 0 7 2
A getFirstPendingUrl() 0 10 2
A hasAlreadyBeenProcessed() 0 14 3
A markAsProcessed() 0 4 1
1
<?php
2
3
namespace Spatie\Crawler\CrawlQueue;
4
5
use Predis\Client;
6
use Psr\Http\Message\UriInterface;
7
use Spatie\Crawler\CrawlUrl;
8
use Spatie\Crawler\Exception\InvalidUrl;
9
use Spatie\Crawler\Exception\UrlNotFoundByIndex;
10
11
/**
 * CrawlQueue implementation backed by two Redis hashes.
 *
 * Every URL that has been added is stored in the `urls` hash; URLs that still
 * have to be crawled are additionally stored in the `pending` hash. Both
 * hashes use the URL string as field name and a serialized CrawlUrl as value.
 * The URL string is also used as the CrawlUrl id.
 */
class RedisCrawlQueue implements CrawlQueue
{
    /** Name of the hash holding all known URLs, indexed by URL string. */
    public const URLS = 'urls';

    /** Name of the hash holding the pending URLs, indexed by URL string. */
    public const PENDING_URLS = 'pending';

    /**
     * Redis connection used for every queue operation.
     *
     * @var \Predis\Client
     */
    private $redis;

    /**
     * @param \Predis\Client|null $redis Client to use. When omitted, a client
     *                                   is created with no constructor arguments.
     */
    public function __construct(?Client $redis = null)
    {
        $this->redis = $redis ?? new Client();
    }

    /**
     * Register a URL as known and pending, unless it is already known.
     *
     * @param CrawlUrl $url URL to enqueue; its id is set to the URL string.
     *
     * @return CrawlQueue This queue, to allow chaining.
     */
    public function add(CrawlUrl $url) : CrawlQueue
    {
        $urlString = (string) $url->url;

        if ($this->has($urlString)) {
            return $this;
        }

        // The id has to be set before serializing so the stored copy carries it.
        $url->setId($urlString);
        $payload = serialize($url);

        $this->redis->hset(self::URLS, $urlString, $payload);
        $this->redis->hset(self::PENDING_URLS, $urlString, $payload);

        return $this;
    }

    /**
     * Tell whether a URL has been added to the queue before.
     *
     * @param CrawlUrl|UriInterface|string $crawlUrl
     *
     * @return bool True when the URL string is a field of the `urls` hash.
     *
     * @throws InvalidUrl When $crawlUrl is none of the accepted types.
     */
    public function has($crawlUrl) : bool
    {
        if ($crawlUrl instanceof CrawlUrl) {
            $url = (string) $crawlUrl->url;
        } elseif ($crawlUrl instanceof UriInterface) {
            $url = (string) $crawlUrl;
        } elseif (is_string($crawlUrl)) {
            $url = $crawlUrl;
        } else {
            throw InvalidUrl::unexpectedType($crawlUrl);
        }

        return (bool) $this->redis->hexists(self::URLS, $url);
    }

    /**
     * Tell whether at least one URL is still waiting to be crawled.
     */
    public function hasPendingUrls() : bool
    {
        return (bool) $this->redis->hlen(self::PENDING_URLS);
    }

    /**
     * Fetch a known URL by its id (the URL string).
     *
     * @param mixed $id Id handed out by add().
     *
     * @return CrawlUrl
     *
     * @throws UrlNotFoundByIndex When no usable entry is stored under $id.
     * @throws InvalidUrl         When $id is of a type has() does not accept.
     */
    public function getUrlById($id) : CrawlUrl
    {
        if ($this->has($id)) {
            // The field can vanish between HEXISTS and HGET, so the reply is
            // validated instead of being trusted.
            $crawlUrl = $this->unserializeCrawlUrl($this->redis->hget(self::URLS, $id));

            if ($crawlUrl !== null) {
                return $crawlUrl;
            }
        }

        throw new UrlNotFoundByIndex("Crawl url {$id} not found in hashes.");
    }

    /**
     * Return one pending URL, or null when nothing usable is pending.
     *
     * The entry returned is the first one listed by HKEYS that can still be
     * read. NOTE(review): this relies on the order in which Redis lists hash
     * fields; confirm that callers do not need strict FIFO behaviour.
     *
     * @return CrawlUrl|null
     */
    public function getFirstPendingUrl() : ?CrawlUrl
    {
        foreach ($this->redis->hkeys(self::PENDING_URLS) as $key) {
            // A key may have been removed since HKEYS ran; skip it and try the
            // next one rather than returning an invalid value.
            $crawlUrl = $this->unserializeCrawlUrl($this->redis->hget(self::PENDING_URLS, $key));

            if ($crawlUrl !== null) {
                return $crawlUrl;
            }
        }

        return null;
    }

    /**
     * Tell whether a URL is known to the queue and no longer pending.
     *
     * @param CrawlUrl $url
     *
     * @return bool False for pending URLs and for URLs that were never added.
     */
    public function hasAlreadyBeenProcessed(CrawlUrl $url) : bool
    {
        $urlString = (string) $url->url;

        if ($this->redis->hexists(self::PENDING_URLS, $urlString)) {
            return false;
        }

        return (bool) $this->redis->hexists(self::URLS, $urlString);
    }

    /**
     * Remove a URL from the pending hash; it stays in the hash of known URLs.
     *
     * @param CrawlUrl $crawlUrl
     */
    public function markAsProcessed(CrawlUrl $crawlUrl)
    {
        $this->redis->hdel(self::PENDING_URLS, (string) $crawlUrl->url);
    }

    /**
     * Turn a raw hash value back into a CrawlUrl.
     *
     * @param mixed $payload Reply of HGET.
     *
     * @return CrawlUrl|null Null when the reply is not a string or does not
     *                       unserialize to a CrawlUrl.
     */
    private function unserializeCrawlUrl($payload) : ?CrawlUrl
    {
        if (!is_string($payload)) {
            return null;
        }

        // NOTE(review): unserialize() instantiates whatever classes the payload
        // names, so the Redis instance must only be writable by trusted code.
        $crawlUrl = unserialize($payload);

        return $crawlUrl instanceof CrawlUrl ? $crawlUrl : null;
    }
}
108