1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Spatie\Crawler\CrawlQueue; |
4
|
|
|
|
5
|
|
|
use TypeError; |
6
|
|
|
use Spatie\Crawler\CrawlUrl; |
7
|
|
|
use Psr\Http\Message\UriInterface; |
8
|
|
|
use Spatie\Crawler\Exception\UrlNotFoundByIndex; |
9
|
|
|
|
10
|
|
|
/**
 * Crawl queue implemented with arrays.
 *
 * Two arrays are maintained, both keyed by the URL cast to a string:
 * `$urls` holds every URL ever added, `$pendingUrls` holds the subset that
 * has not yet been passed to markAsProcessed().
 */
class ArrayCrawlQueue implements CrawlQueue
{
    /**
     * All known URLs, indexed by URL string.
     *
     * @var CrawlUrl[]
     */
    protected $urls = [];

    /**
     * Pending URLs, indexed by URL string.
     *
     * @var CrawlUrl[]
     */
    protected $pendingUrls = [];

    /**
     * Add a URL to the queue unless a URL with the same string form is
     * already known.
     *
     * A newly added URL gets its string form as id and is stored both as
     * known and as pending. Adding an already known URL is a no-op.
     *
     * @param CrawlUrl $url
     *
     * @return CrawlQueue This queue, to allow chaining.
     */
    public function add(CrawlUrl $url) : CrawlQueue
    {
        $urlString = (string) $url->url;

        if (! isset($this->urls[$urlString])) {
            $url->setId($urlString);

            $this->urls[$urlString] = $url;
            $this->pendingUrls[$urlString] = $url;
        }

        return $this;
    }

    /**
     * Tell whether at least one URL is still pending.
     *
     * @return bool
     */
    public function hasPendingUrls() : bool
    {
        // An empty array casts to false, a non-empty one to true.
        return (bool) $this->pendingUrls;
    }

    /**
     * Look up a known URL by its id (the URL string it was added under).
     *
     * @param mixed $id
     *
     * @return CrawlUrl
     *
     * @throws UrlNotFoundByIndex When no URL is stored under the given id.
     */
    public function getUrlById($id) : CrawlUrl
    {
        if (! isset($this->urls[$id])) {
            throw new UrlNotFoundByIndex("Crawl url $id not found in collection.");
        }

        return $this->urls[$id];
    }

    /**
     * Tell whether the URL was added to the queue and is no longer pending.
     *
     * @param CrawlUrl $url
     *
     * @return bool True only for a known URL that has been marked as
     *              processed; false for pending or unknown URLs.
     */
    public function hasAlreadyBeenProcessed(CrawlUrl $url) : bool
    {
        $url = (string) $url->url;

        // Processed means: known to the queue AND removed from the pending
        // list. Both conditions must be checked against different arrays.
        return ! isset($this->pendingUrls[$url]) && isset($this->urls[$url]);
    }

    /**
     * Remove the URL from the pending list. It stays in the list of known
     * URLs.
     *
     * @param CrawlUrl $crawlUrl
     */
    public function markAsProcessed(CrawlUrl $crawlUrl)
    {
        $url = (string) $crawlUrl->url;

        unset($this->pendingUrls[$url]);
    }

    /**
     * Tell whether the given URL is known to the queue (pending or not).
     *
     * @param CrawlUrl|UriInterface $crawlUrl
     *
     * @return bool
     *
     * @throws TypeError When the argument is neither a CrawlUrl nor a UriInterface.
     */
    public function has($crawlUrl) : bool
    {
        if ($crawlUrl instanceof CrawlUrl) {
            $url = (string) $crawlUrl->url;
        } elseif ($crawlUrl instanceof UriInterface) {
            $url = (string) $crawlUrl;
        } else {
            throw new TypeError(sprintf(
                'Expected %s or %s, got %s.',
                CrawlUrl::class,
                UriInterface::class,
                is_object($crawlUrl) ? get_class($crawlUrl) : gettype($crawlUrl)
            ));
        }

        return isset($this->urls[$url]);
    }

    /**
     * Return the first pending URL in iteration order, or null when nothing
     * is pending.
     *
     * @return CrawlUrl|null
     */
    public function getFirstPendingUrl() : ?CrawlUrl
    {
        // Returning from inside the loop yields the first element without
        // touching the array's internal pointer.
        foreach ($this->pendingUrls as $pendingUrl) {
            return $pendingUrl;
        }

        return null;
    }
}
103
|
|
|
|
This check compares calls to functions or methods with their respective definitions. If the call has more arguments than are defined, it raises an issue.
If a function is defined several times with a different number of parameters, the check may pick up the wrong definition and report false positives. One codebase where this has been known to happen is WordPress.
In this case you can add the `@ignore` PhpDoc annotation to the duplicate definition and it will be ignored.