1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Spatie\Crawler; |
4
|
|
|
|
5
|
|
|
use Spatie\Crawler\Exception\UrlNotFoundByIndex; |
6
|
|
|
|
7
|
|
|
/**
 * Crawl queue that keeps its entries in a database table accessed through PDO.
 *
 * The queries issued by this class expect the table to expose the columns
 * `id`, `url`, `status` and `object`: `object` holds the PHP-serialized
 * CrawlUrl and `status` holds one of the STATUS_* constants below.
 *
 * NOTE(review): the table name is concatenated into every SQL string because
 * identifiers cannot be bound as parameters, so it must only ever come from
 * trusted configuration, never from user input.
 */
class PdoCrawlQueue implements CrawlQueue
{
    /** Value written to the `status` column when a URL is added to the queue. */
    const STATUS_PENDING = 1;

    /** Value written to the `status` column by markAsProcessed(). */
    const STATUS_PROCESSED = 2;

    /** @var \PDO */
    private $pdo;

    /** @var string */
    private $table;

    /**
     * @param \PDO   $pdo   Connection used for every query of this queue.
     * @param string $table Name of the table holding the queue (trusted input only).
     */
    public function __construct(\PDO $pdo, string $table)
    {
        $this->pdo = $pdo;
        $this->table = $table;
    }

    /**
     * Adds a URL to the queue unless a row with the same URL already exists.
     *
     * The row is first inserted to obtain its id, the id is assigned to the
     * CrawlUrl, and the serialized object is then stored again so that the
     * persisted copy carries that id. Both statements run in one transaction.
     *
     * @param CrawlUrl $url
     *
     * @return CrawlQueue This queue, to allow chaining.
     *
     * @throws \Throwable Whatever the database layer or serialization raised;
     *                    the transaction is rolled back before rethrowing.
     */
    public function add(CrawlUrl $url): CrawlQueue
    {
        if ($this->has($url)) {
            return $this;
        }

        $this->pdo->beginTransaction();

        try {
            $insert = $this->pdo->prepare(
                'insert into '.$this->table.' (url, status, object) values (:url, :status, :object)'
            );
            $insert->execute([
                'url' => $url->url,
                'object' => serialize($url),
                'status' => self::STATUS_PENDING,
            ]);
            $insert->closeCursor();

            // Presumably `id` is generated by the database on insert — confirm against the schema.
            $url->setId($this->pdo->lastInsertId());

            // Re-serialize now that the object knows its id and overwrite the stored copy.
            $update = $this->pdo->prepare(
                'update '.$this->table.' set object = :object where id = :id'
            );
            $update->execute([
                'object' => serialize($url),
                'id' => $url->getId(),
            ]);
            $update->closeCursor();

            $this->pdo->commit();
        } catch (\Throwable $exception) {
            // Catch \Throwable (not only \Exception) so that Errors also trigger a rollback,
            // and guard the rollback so it cannot mask the original failure when the
            // transaction has already ended.
            if ($this->pdo->inTransaction()) {
                $this->pdo->rollBack();
            }

            throw $exception;
        }

        return $this;
    }

    /**
     * Tells whether at least one row still has the pending status.
     *
     * @return bool
     */
    public function hasPendingUrls(): bool
    {
        $row = $this->fetchFirstRow(
            'select url from '.$this->table.' where status = :status limit 1',
            ['status' => self::STATUS_PENDING]
        );

        return (bool) $row;
    }

    /**
     * Loads the crawl URL stored under the given id.
     *
     * @param int $id
     *
     * @return \Spatie\Crawler\CrawlUrl
     *
     * @throws UrlNotFoundByIndex When no row with that id exists.
     */
    public function getUrlById(int $id): CrawlUrl
    {
        $row = $this->fetchFirstRow(
            'select object from '.$this->table.' where id = :id',
            ['id' => $id]
        );

        if (! $row) {
            throw new UrlNotFoundByIndex(sprintf('#%d crawl url not found in database', $id));
        }

        // NOTE(review): unserialize() is only safe while the table content is fully trusted.
        return unserialize($row['object']);
    }

    /**
     * Tells whether a row with this URL exists and carries the processed status.
     *
     * @param CrawlUrl $url
     *
     * @return bool
     */
    public function hasAlreadyBeenProcessed(CrawlUrl $url)
    {
        $row = $this->fetchFirstRow(
            'select id from '.$this->table.' where url = :url and status = :status',
            [
                'url' => $url->url,
                'status' => self::STATUS_PROCESSED,
            ]
        );

        return (bool) $row;
    }

    /**
     * Sets the processed status on every row whose URL matches the given crawl URL.
     *
     * @param CrawlUrl $crawlUrl
     */
    public function markAsProcessed(CrawlUrl $crawlUrl)
    {
        $statement = $this->pdo->prepare(
            'update '.$this->table.' set status = :status where url = :url'
        );
        $statement->execute([
            'url' => $crawlUrl->url,
            'status' => self::STATUS_PROCESSED,
        ]);
        $statement->closeCursor();
    }

    /**
     * Tells whether the queue already contains a row for the given URL,
     * regardless of its status.
     *
     * @param CrawlUrl|Url $crawlUrl A plain Url is wrapped in a CrawlUrl first.
     *
     * @return bool
     */
    public function has($crawlUrl): bool
    {
        if ($crawlUrl instanceof Url) {
            $crawlUrl = CrawlUrl::create($crawlUrl);
        }

        $row = $this->fetchFirstRow(
            'select id from '.$this->table.' where url = :url',
            ['url' => $crawlUrl->url]
        );

        return (bool) $row;
    }

    /**
     * Returns one crawl URL that still has the pending status, if any.
     *
     * The query has no ORDER BY, so which pending row is returned is up to the database.
     *
     * @return \Spatie\Crawler\CrawlUrl|null Null when no pending row exists.
     */
    public function getPendingUrl()
    {
        $row = $this->fetchFirstRow(
            'select object from '.$this->table.' where status = :status limit 1',
            ['status' => self::STATUS_PENDING]
        );

        if (! $row) {
            return null;
        }

        // NOTE(review): unserialize() is only safe while the table content is fully trusted.
        return unserialize($row['object']);
    }

    /**
     * Runs a parameterized query, returns its first row and releases the cursor.
     *
     * The row is always fetched as an associative array so that callers can
     * index it by column name whatever default fetch mode the connection uses.
     *
     * @param string $sql        Statement with named placeholders.
     * @param array  $parameters Placeholder name => value.
     *
     * @return array|false The first row, or false when the result set is empty.
     */
    private function fetchFirstRow(string $sql, array $parameters)
    {
        $statement = $this->pdo->prepare($sql);
        $statement->execute($parameters);

        $row = $statement->fetch(\PDO::FETCH_ASSOC);
        $statement->closeCursor();

        return $row;
    }
}
172
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.