1
|
|
|
<?php |
2
|
|
|
namespace VDB\Spider; |
3
|
|
|
|
4
|
|
|
use Exception; |
5
|
|
|
use Guzzle\Http\Message\Response; |
6
|
|
|
use PHPUnit_Framework_MockObject_MockObject; |
7
|
|
|
use VDB\Spider\Discoverer\XPathExpressionDiscoverer; |
8
|
|
|
use VDB\Spider\Tests\TestCase; |
9
|
|
|
use VDB\Spider\QueueManager\InMemoryQueueManager; |
10
|
|
|
use VDB\Spider\StatsHandler; |
11
|
|
|
use VDB\Spider\Uri\FilterableUri; |
12
|
|
|
|
13
|
|
|
/**
 * Tests Spider::crawl() against a mocked request handler.
 *
 * The request handler mock answers requests for seven stub URIs
 * (http://php-spider.org/A through /G) with responses built from the HTML
 * fixture files in Fixtures/. The tests then compare the resources recorded
 * by the StatsHandler with the order expected for each traversal algorithm
 * and limit setting.
 */
class SpiderTest extends TestCase
{
    /**
     * @var Spider
     */
    protected $spider;

    /**
     * @var StatsHandler
     */
    protected $statsHandler;

    /**
     * Declared explicitly so that setUp() does not create a dynamic property.
     *
     * @var LogHandler
     */
    protected $logHandler;

    /**
     * @var PHPUnit_Framework_MockObject_MockObject
     */
    protected $requestHandler;

    /** @var FilterableUri */
    protected $linkA;
    /** @var FilterableUri */
    protected $linkB;
    /** @var FilterableUri */
    protected $linkC;
    /** @var FilterableUri */
    protected $linkD;
    /** @var FilterableUri */
    protected $linkE;
    /** @var FilterableUri */
    protected $linkF;
    /** @var FilterableUri */
    protected $linkG;

    /** @var Response */
    protected $responseA;
    /** @var Response */
    protected $responseB;
    /** @var Response */
    protected $responseC;
    /** @var Response */
    protected $responseD;
    /** @var Response */
    protected $responseE;
    /** @var Response */
    protected $responseF;
    /** @var Response */
    protected $responseG;

    /** @var string */
    protected $hrefA;
    /** @var string */
    protected $hrefB;
    /** @var string */
    protected $hrefC;
    /** @var string */
    protected $hrefD;
    /** @var string */
    protected $hrefE;
    /** @var string */
    protected $hrefF;
    /** @var string */
    protected $hrefG;

    /**
     * Builds the fixture used by every test.
     *
     * Creates a Spider seeded with URI A, replaces its request handler with a
     * mock whose request() delegates to doTestRequest(), registers an XPath
     * discoverer for '//a', and subscribes a StatsHandler and a LogHandler to
     * both the spider's dispatcher and the queue manager's dispatcher.
     *
     * This method is called before a test is executed.
     */
    protected function setUp()
    {
        $this->spider = new Spider('http://php-spider.org/A');

        $this->requestHandler = $this->getMock('VDB\Spider\RequestHandler\RequestHandler');

        $this->hrefA = 'http://php-spider.org/A';
        $this->hrefB = 'http://php-spider.org/B';
        $this->hrefC = 'http://php-spider.org/C';
        $this->hrefD = 'http://php-spider.org/D';
        $this->hrefE = 'http://php-spider.org/E';
        $this->hrefF = 'http://php-spider.org/F';
        $this->hrefG = 'http://php-spider.org/G';

        $this->linkA = new FilterableUri($this->hrefA);
        $this->linkB = new FilterableUri($this->hrefB);
        $this->linkC = new FilterableUri($this->hrefC);
        $this->linkD = new FilterableUri($this->hrefD);
        $this->linkE = new FilterableUri($this->hrefE);
        $this->linkF = new FilterableUri($this->hrefF);
        $this->linkG = new FilterableUri($this->hrefG);

        // One canned 200 response per stub URI, with the fixture HTML as body.
        $htmlA = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResourceA.html');
        $this->responseA = new Response(200, null, $htmlA);

        $htmlB = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResourceB.html');
        $this->responseB = new Response(200, null, $htmlB);

        $htmlC = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResourceC.html');
        $this->responseC = new Response(200, null, $htmlC);

        $htmlD = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResourceD.html');
        $this->responseD = new Response(200, null, $htmlD);

        $htmlE = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResourceE.html');
        $this->responseE = new Response(200, null, $htmlE);

        $htmlF = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResourceF.html');
        $this->responseF = new Response(200, null, $htmlF);

        $htmlG = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResourceG.html');
        $this->responseG = new Response(200, null, $htmlG);

        // Route every request() call on the mock through doTestRequest().
        $this->requestHandler
            ->expects($this->any())
            ->method('request')
            ->will($this->returnCallback(array($this, 'doTestRequest')));

        $this->spider->setRequestHandler($this->requestHandler);

        $this->spider->getDiscovererSet()->set(new XPathExpressionDiscoverer('//a'));

        $this->statsHandler = new StatsHandler();
        $this->spider->getDispatcher()->addSubscriber($this->statsHandler);
        $this->spider->getQueueManager()->getDispatcher()->addSubscriber($this->statsHandler);

        $this->logHandler = new LogHandler();
        $this->spider->getDispatcher()->addSubscriber($this->logHandler);
        $this->spider->getQueueManager()->getDispatcher()->addSubscriber($this->logHandler);
    }

    /**
     * Callback used as the mocked request handler's request() implementation.
     *
     * Reads the requested link from the first call argument, matches its
     * string form against the seven stubbed links and returns the result of
     * getResource() for the matching link/response pair.
     * NOTE(review): getResource() is not defined in this class; presumably it
     * is provided by the parent TestCase - confirm.
     *
     * @return Resource
     * @throws \ErrorException when the requested URI is not one of the stubbed links
     */
    public function doTestRequest()
    {
        $link = func_get_arg(0);

        switch ($link->toString()) {
            case $this->linkA->toString():
                return $this->getResource($this->linkA, $this->responseA);
            case $this->linkB->toString():
                return $this->getResource($this->linkB, $this->responseB);
            case $this->linkC->toString():
                return $this->getResource($this->linkC, $this->responseC);
            case $this->linkD->toString():
                return $this->getResource($this->linkD, $this->responseD);
            case $this->linkE->toString():
                return $this->getResource($this->linkE, $this->responseE);
            case $this->linkF->toString():
                return $this->getResource($this->linkF, $this->responseF);
            case $this->linkG->toString():
                return $this->getResource($this->linkG, $this->responseG);
            default:
                throw new \ErrorException('The requested URI was not stubbed: ' . $link->toString());
        }
    }

    /**
     * Crawls with the queue manager's traversal algorithm left untouched and
     * maxDepth 10, and expects all seven links in the order A, E, F, C, G, B, D.
     *
     * @covers VDB\Spider\Spider::crawl
     *
     * Behaviour as explained here: https://en.wikipedia.org/wiki/Depth-first_search#Example
     */
    public function testCrawlDFSDefaultBehaviour()
    {
        $this->spider->getQueueManager()->maxDepth = 10;

        $this->spider->crawl();

        $expected = array(
            $this->linkA,
            $this->linkE,
            $this->linkF,
            $this->linkC,
            $this->linkG,
            $this->linkB,
            $this->linkD
        );

        $this->assertEquals($expected, $this->statsHandler->getPersisted());
    }

    /**
     * Crawls breadth-first with maxDepth 1000 and expects all seven links in
     * the order A, B, C, E, D, F, G.
     *
     * @covers VDB\Spider\Spider::crawl
     *
     */
    public function testCrawlBFSDefaultBehaviour()
    {
        $this->spider->getQueueManager()->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
        $this->spider->getQueueManager()->maxDepth = 1000;

        $this->spider->crawl();

        $expected = array(
            $this->linkA,
            $this->linkB,
            $this->linkC,
            $this->linkE,
            $this->linkD,
            $this->linkF,
            $this->linkG
        );

        $this->assertEquals($expected, $this->statsHandler->getPersisted());
    }

    /**
     * Crawls with the traversal algorithm left untouched and maxDepth 1, and
     * expects only A, E, C, B to be persisted, in that order.
     *
     * @covers VDB\Spider\Spider::crawl
     *
     * Behaviour as explained here: https://en.wikipedia.org/wiki/Depth-first_search#Example
     */
    public function testCrawlDFSMaxDepthOne()
    {
        $this->spider->getQueueManager()->maxDepth = 1;

        $this->spider->crawl();

        $expected = array(
            $this->linkA,
            $this->linkE,
            $this->linkC,
            $this->linkB,
        );

        $this->assertEquals($expected, $this->statsHandler->getPersisted());
    }

    /**
     * Crawls breadth-first with maxDepth 1 and expects only A, B, C, E to be
     * persisted, in that order.
     */
    public function testCrawlBFSMaxDepthOne()
    {
        $this->spider->getQueueManager()->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
        $this->spider->getQueueManager()->maxDepth = 1;

        $this->spider->crawl();

        $expected = array(
            $this->linkA,
            $this->linkB,
            $this->linkC,
            $this->linkE,
        );

        $this->assertEquals($expected, $this->statsHandler->getPersisted());
    }

    /**
     * Crawls with the traversal algorithm left untouched, maxDepth 1000 and
     * the spider's downloadLimit set to 3, and expects only A, E, F to be
     * persisted, in that order.
     *
     * @covers VDB\Spider\Spider::crawl
     */
    public function testCrawlDFSMaxQueueSize()
    {
        $this->spider->getQueueManager()->maxDepth = 1000;
        $this->spider->downloadLimit = 3;

        $this->spider->crawl();

        $expected = array(
            $this->linkA,
            $this->linkE,
            $this->linkF,
        );

        $this->assertEquals($expected, $this->statsHandler->getPersisted());
    }

    /**
     * Crawls breadth-first with maxDepth 1000 and the spider's downloadLimit
     * set to 3, and expects only A, B, C to be persisted, in that order.
     */
    public function testCrawlBFSMaxQueueSize()
    {
        $this->spider->getQueueManager()->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
        $this->spider->getQueueManager()->maxDepth = 1000;
        $this->spider->downloadLimit = 3;

        $this->spider->crawl();

        $expected = array(
            $this->linkA,
            $this->linkB,
            $this->linkC,
        );

        $this->assertEquals($expected, $this->statsHandler->getPersisted());
    }

    /**
     * Makes the mocked request handler throw on request() and expects the
     * crawl to record no filtered, no persisted and exactly one failed entry.
     *
     * @covers VDB\Spider\Spider::crawl
     */
    public function testCrawlFailedRequest()
    {
        // Added on top of the expectation registered in setUp().
        $this->requestHandler
            ->expects($this->any())
            ->method('request')
            ->will(
                $this->throwException(new Exception('Failed mock request!'))
            );

        $this->spider->crawl();
        $stats = $this->statsHandler;

        $this->assertCount(0, $stats->getFiltered(), 'Filtered count');
        $this->assertCount(0, $stats->getPersisted(), 'Persisted count');
        $this->assertCount(1, $stats->getFailed(), 'Failed count');
    }
}
306
|
|
|
|
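In PHP it is possible to write to properties without declaring them (creating such dynamic properties is deprecated as of PHP 8.2). For example, the following is accepted by PHP:

    class MyClass { }

    $x = new MyClass();
    $x->foo = true;
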
Generally, it is a good practice to explicitly declare properties to avoid accidental typos and provide IDE auto-completion:

    class MyClass {
        public $foo;
    }

    $x = new MyClass();
    $x->foo = true;