1 | <?php |
||
10 | class ArrayCrawlQueue implements CrawlQueue |
||
11 | { |
||
12 | /** |
||
13 | * All known URLs, indexed by URL string. |
||
14 | * |
||
15 | * @var CrawlUrl[] |
||
16 | */ |
||
17 | protected $urls = []; |
||
18 | |||
19 | /** |
||
20 | * Pending URLs, indexed by URL string. |
||
21 | * |
||
22 | * @var CrawlUrl[] |
||
23 | */ |
||
24 | protected $pendingUrls = []; |
||
25 | |||
/**
 * Register a URL with the queue unless it is already known.
 *
 * The string form of the URL serves both as the array key and as the id
 * assigned to the CrawlUrl. A URL that is new to the queue is stored in the
 * known-URL map and in the pending map; a URL that is already known is
 * left untouched.
 *
 * @param CrawlUrl $url
 *
 * @return CrawlQueue This queue instance, so calls can be chained.
 */
public function add(CrawlUrl $url) : CrawlQueue
{
    $key = (string) $url->url;

    // Already registered: nothing to do.
    if (isset($this->urls[$key])) {
        return $this;
    }

    $url->setId($key);

    $this->urls[$key] = $url;
    $this->pendingUrls[$key] = $url;

    return $this;
}
||
39 | |||
/**
 * Tell whether at least one URL is still in the pending map.
 *
 * @return bool
 */
public function hasPendingUrls() : bool
{
    return ! empty($this->pendingUrls);
}
||
44 | |||
/**
 * Look up a known URL by its id (the key used in the known-URL map).
 *
 * @param mixed $id
 *
 * @return CrawlUrl
 *
 * @throws UrlNotFoundByIndex When no URL is stored under the given id.
 */
public function getUrlById($id) : CrawlUrl
{
    if (isset($this->urls[$id])) {
        return $this->urls[$id];
    }

    throw new UrlNotFoundByIndex("Crawl url {$id} not found in collection.");
}
||
53 | |||
/**
 * Tell whether the given URL is known to the queue and no longer pending.
 *
 * Returns false both for URLs that are still pending and for URLs the
 * queue has never seen.
 *
 * @param CrawlUrl $url
 *
 * @return bool
 */
public function hasAlreadyBeenProcessed(CrawlUrl $url) : bool
{
    $key = (string) $url->url;

    $isPending = isset($this->pendingUrls[$key]);
    $isKnown = isset($this->urls[$key]);

    return $isKnown && ! $isPending;
}
||
68 | |||
/**
 * Remove the given URL from the pending map.
 *
 * The entry in the known-URL map is not touched here, and unsetting a key
 * that is not pending is a no-op.
 *
 * @param CrawlUrl $crawlUrl
 */
public function markAsProcessed(CrawlUrl $crawlUrl)
{
    unset($this->pendingUrls[(string) $crawlUrl->url]);
}
||
75 | |||
/**
 * Tell whether the queue knows the given URL, pending or not.
 *
 * Accepts either a CrawlUrl (its `url` property is used) or a
 * UriInterface; in both cases the string form is the lookup key. Any
 * other argument results in throwing the value returned by
 * InvalidUrl::unexpectedType().
 *
 * @param CrawlUrl|UriInterface $crawlUrl
 *
 * @return bool
 */
public function has($crawlUrl) : bool
{
    if ($crawlUrl instanceof CrawlUrl) {
        return isset($this->urls[(string) $crawlUrl->url]);
    }

    if ($crawlUrl instanceof UriInterface) {
        return isset($this->urls[(string) $crawlUrl]);
    }

    throw InvalidUrl::unexpectedType($crawlUrl);
}
||
93 | |||
94 | public function getFirstPendingUrl() : ?CrawlUrl |
||
102 | } |
||
103 |