1
|
|
|
<?php |
2
|
|
|
namespace VDB\Spider; |
3
|
|
|
|
4
|
|
|
use Exception; |
5
|
|
|
use Guzzle\Http\Message\Response; |
6
|
|
|
use PHPUnit_Framework_MockObject_MockObject; |
7
|
|
|
use VDB\Spider\Discoverer\XPathExpressionDiscoverer; |
8
|
|
|
use VDB\Spider\Tests\TestCase; |
9
|
|
|
use VDB\Spider\QueueManager\InMemoryQueueManager; |
10
|
|
|
use VDB\Spider\StatsHandler; |
11
|
|
|
use VDB\Spider\Uri\FilterableUri; |
12
|
|
|
|
13
|
|
|
/** |
14
|
|
|
*/ |
15
|
|
|
class SpiderTest extends TestCase |
16
|
|
|
{ |
17
|
|
|
/** |
18
|
|
|
* @var Spider |
19
|
|
|
*/ |
20
|
|
|
protected $spider; |
21
|
|
|
|
22
|
|
|
/** |
23
|
|
|
* @var LogHandler |
24
|
|
|
*/ |
25
|
|
|
protected $logHandler; |
26
|
|
|
|
27
|
|
|
/** |
28
|
|
|
* @var StatsHandler |
29
|
|
|
*/ |
30
|
|
|
protected $statsHandler; |
31
|
|
|
|
32
|
|
|
/** |
33
|
|
|
* @var PHPUnit_Framework_MockObject_MockObject |
34
|
|
|
*/ |
35
|
|
|
protected $requestHandler; |
36
|
|
|
|
37
|
|
|
/** @var FilterableUri */ |
38
|
|
|
protected $linkA; |
39
|
|
|
/** @var FilterableUri */ |
40
|
|
|
protected $linkB; |
41
|
|
|
/** @var FilterableUri */ |
42
|
|
|
protected $linkC; |
43
|
|
|
/** @var FilterableUri */ |
44
|
|
|
protected $linkD; |
45
|
|
|
/** @var FilterableUri */ |
46
|
|
|
protected $linkE; |
47
|
|
|
/** @var FilterableUri */ |
48
|
|
|
protected $linkF; |
49
|
|
|
/** @var FilterableUri */ |
50
|
|
|
protected $linkG; |
51
|
|
|
|
52
|
|
|
/** @var Response */ |
53
|
|
|
protected $responseA; |
54
|
|
|
/** @var Response */ |
55
|
|
|
protected $responseB; |
56
|
|
|
/** @var Response */ |
57
|
|
|
protected $responseC; |
58
|
|
|
/** @var Response */ |
59
|
|
|
protected $responseD; |
60
|
|
|
/** @var Response */ |
61
|
|
|
protected $responseE; |
62
|
|
|
/** @var Response */ |
63
|
|
|
protected $responseF; |
64
|
|
|
/** @var Response */ |
65
|
|
|
protected $responseG; |
66
|
|
|
|
67
|
|
|
/** @var string */ |
68
|
|
|
protected $hrefA; |
69
|
|
|
protected $hrefB; |
70
|
|
|
protected $hrefC; |
71
|
|
|
protected $hrefD; |
72
|
|
|
protected $hrefE; |
73
|
|
|
protected $hrefF; |
74
|
|
|
protected $hrefG; |
75
|
|
|
|
76
|
|
|
/**
 * Builds the spider under test together with its stubbed request handler.
 * PHPUnit calls this method before each test is executed.
 *
 * Setting up the following structure (depth found on the left):
 *
 * 0:        A
 *          /|\
 * 1:      B C E
 *        /|  | |
 * 2:    D F  G |
 *         | _ |
 *
 * Note: E links to F.
 *
 * @throws \RuntimeException When one of the HTML fixture files cannot be read.
 */
protected function setUp()
{
    // One URL per node of the link graph.
    $this->hrefA = 'http://php-spider.org/A';
    $this->hrefB = 'http://php-spider.org/B';
    $this->hrefC = 'http://php-spider.org/C';
    $this->hrefD = 'http://php-spider.org/D';
    $this->hrefE = 'http://php-spider.org/E';
    $this->hrefF = 'http://php-spider.org/F';
    $this->hrefG = 'http://php-spider.org/G';

    // The crawl is seeded with node A; reuse the fixture URL instead of repeating the literal.
    $this->spider = new Spider($this->hrefA);

    $this->requestHandler = $this->getMock('VDB\Spider\RequestHandler\RequestHandlerInterface');

    $this->linkA = new FilterableUri($this->hrefA);
    $this->linkB = new FilterableUri($this->hrefB);
    $this->linkC = new FilterableUri($this->hrefC);
    $this->linkD = new FilterableUri($this->hrefD);
    $this->linkE = new FilterableUri($this->hrefE);
    $this->linkF = new FilterableUri($this->hrefF);
    $this->linkG = new FilterableUri($this->hrefG);

    // Depths mirror the rows of the diagram in the docblock.
    $this->linkA->setDepthFound(0);
    $this->linkB->setDepthFound(1);
    $this->linkC->setDepthFound(1);
    $this->linkD->setDepthFound(2);
    $this->linkE->setDepthFound(1);
    $this->linkF->setDepthFound(2);
    $this->linkG->setDepthFound(2);

    // Each node is answered with a 200 response carrying its HTML fixture.
    $this->responseA = $this->loadHtmlFixtureResponse('A');
    $this->responseB = $this->loadHtmlFixtureResponse('B');
    $this->responseC = $this->loadHtmlFixtureResponse('C');
    $this->responseD = $this->loadHtmlFixtureResponse('D');
    $this->responseE = $this->loadHtmlFixtureResponse('E');
    $this->responseF = $this->loadHtmlFixtureResponse('F');
    $this->responseG = $this->loadHtmlFixtureResponse('G');

    // Every request() call on the mock is routed to doTestRequest().
    $this->requestHandler
        ->expects($this->any())
        ->method('request')
        ->will($this->returnCallback(array($this, 'doTestRequest')));

    $this->spider->setRequestHandler($this->requestHandler);

    $this->spider->getDiscovererSet()->set(new XPathExpressionDiscoverer('//a'));

    // The stats handler listens on both dispatchers; the tests read their results from it.
    $this->statsHandler = new StatsHandler();
    $this->spider->getDispatcher()->addSubscriber($this->statsHandler);
    $this->spider->getQueueManager()->getDispatcher()->addSubscriber($this->statsHandler);

    $this->logHandler = new LogHandler();
    $this->spider->getDispatcher()->addSubscriber($this->logHandler);
    $this->spider->getQueueManager()->getDispatcher()->addSubscriber($this->logHandler);
}

/**
 * Creates a 200 response whose body is the HTML fixture of the given node.
 *
 * @param string $node Node letter used in the fixture file name, e.g. 'A'.
 *
 * @return Response
 * @throws \RuntimeException When the fixture file is missing or cannot be read.
 */
private function loadHtmlFixtureResponse($node)
{
    $path = __DIR__ . '/Fixtures/SpiderTestHTMLResource' . $node . '.html';

    // Fail fast with a clear message instead of handing `false` to the response as its body.
    if (!is_readable($path)) {
        throw new \RuntimeException('Fixture file is not readable: ' . $path);
    }

    $html = file_get_contents($path);
    if ($html === false) {
        throw new \RuntimeException('Fixture file could not be read: ' . $path);
    }

    return new Response(200, null, $html);
}
159
|
|
|
|
160
|
|
|
/**
 * Callback behind the mocked request handler's request() method.
 *
 * Takes the requested link from the first call argument, finds the fixture
 * link with the same string form and returns the resource built from that
 * fixture link and its prepared response.
 *
 * NOTE(review): getResource() is not defined in this class; presumably it is
 * inherited from the parent TestCase - confirm.
 *
 * @return Resource
 * @throws \ErrorException When the requested URI matches none of the fixture links.
 */
public function doTestRequest()
{
    $requested = func_get_arg(0);
    $requestedUri = $requested->toString();

    $fixtures = array(
        array($this->linkA, $this->responseA),
        array($this->linkB, $this->responseB),
        array($this->linkC, $this->responseC),
        array($this->linkD, $this->responseD),
        array($this->linkE, $this->responseE),
        array($this->linkF, $this->responseF),
        array($this->linkG, $this->responseG),
    );

    foreach ($fixtures as $fixture) {
        list($fixtureLink, $fixtureResponse) = $fixture;

        // Loose comparison, which is what a switch statement performs.
        if ($requestedUri == $fixtureLink->toString()) {
            return $this->getResource($fixtureLink, $fixtureResponse);
        }
    }

    throw new \ErrorException('The requested URI was not stubbed: ' . $requested->toString());
}
187
|
|
|
|
188
|
|
|
/**
 * Crawls without changing the queue manager's traversal algorithm and with
 * a maximum depth of 10, then checks the order in which links were persisted.
 *
 * Behaviour as explained here: https://en.wikipedia.org/wiki/Depth-first_search#Example
 *
 * @covers VDB\Spider\Spider::crawl
 */
public function testCrawlDFSDefaultBehaviour()
{
    $discovererSet = $this->spider->getDiscovererSet();
    $discovererSet->maxDepth = 10;

    $this->spider->crawl();

    $expectedOrder = array(
        $this->linkA,
        $this->linkE,
        $this->linkF,
        $this->linkC,
        $this->linkG,
        $this->linkB,
        $this->linkD,
    );
    $persisted = $this->statsHandler->getPersisted();

    $this->assertEquals($expectedOrder, $persisted);
}
211
|
|
|
|
212
|
|
|
/**
 * Switches the queue manager to breadth-first traversal, crawls with a
 * maximum depth of 1000 and checks the order in which links were persisted.
 *
 * @covers VDB\Spider\Spider::crawl
 */
public function testCrawlBFSDefaultBehaviour()
{
    $queueManager = $this->spider->getQueueManager();
    $queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);

    $discovererSet = $this->spider->getDiscovererSet();
    $discovererSet->maxDepth = 1000;

    $this->spider->crawl();

    $expectedOrder = array(
        $this->linkA,
        $this->linkB,
        $this->linkC,
        $this->linkE,
        $this->linkD,
        $this->linkF,
        $this->linkG,
    );
    $persisted = $this->statsHandler->getPersisted();

    $this->assertEquals($expectedOrder, $persisted);
}
235
|
|
|
|
236
|
|
|
/**
 * Crawls without changing the queue manager's traversal algorithm and with
 * the maximum depth set to 1, so the depth-2 links must not be persisted.
 *
 * Behaviour as explained here: https://en.wikipedia.org/wiki/Depth-first_search#Example
 *
 * Given the following structure:
 *
 * 0:        A
 *          /|\
 * 1:      B C E
 *        /|  | |
 * 2:    D F  G |
 *         | _ |
 *
 * We expect the following result: A, E, C, B
 *
 * @covers VDB\Spider\Spider::crawl
 */
public function testCrawlDFSMaxDepthOne()
{
    $discovererSet = $this->spider->getDiscovererSet();
    $discovererSet->maxDepth = 1;

    $this->spider->crawl();

    $expectedOrder = array(
        $this->linkA,
        $this->linkE,
        $this->linkC,
        $this->linkB,
    );
    $persisted = $this->statsHandler->getPersisted();

    $this->assertEquals($expectedOrder, $persisted);
}
268
|
|
|
|
269
|
|
|
/**
 * Switches the queue manager to breadth-first traversal and crawls with the
 * maximum depth set to 1, so the depth-2 links must not be persisted.
 *
 * We expect the following result: A, B, C, E
 *
 * @covers VDB\Spider\Spider::crawl
 */
public function testCrawlBFSMaxDepthOne()
{
    $this->spider->getQueueManager()->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
    $this->spider->getDiscovererSet()->maxDepth = 1;

    $this->spider->crawl();

    $expected = array(
        $this->linkA,
        $this->linkB,
        $this->linkC,
        $this->linkE,
    );

    $this->assertEquals($expected, $this->statsHandler->getPersisted());
}
285
|
|
|
|
286
|
|
|
/**
 * Crawls without changing the queue manager's traversal algorithm, with a
 * maximum depth of 1000 and the spider's downloadLimit set to 3, and expects
 * exactly three persisted links.
 *
 * NOTE(review): despite the method name, the setting exercised here is
 * downloadLimit, not a queue size.
 *
 * @covers VDB\Spider\Spider::crawl
 */
public function testCrawlDFSMaxQueueSize()
{
    $discovererSet = $this->spider->getDiscovererSet();
    $discovererSet->maxDepth = 1000;
    $this->spider->downloadLimit = 3;

    $this->spider->crawl();

    $expectedOrder = array(
        $this->linkA,
        $this->linkE,
        $this->linkF,
    );
    $persisted = $this->statsHandler->getPersisted();

    $this->assertEquals($expectedOrder, $persisted);
}
304
|
|
|
|
305
|
|
View Code Duplication |
/**
 * Switches the queue manager to breadth-first traversal and crawls with a
 * maximum depth of 1000 and the spider's downloadLimit set to 3, expecting
 * exactly three persisted links.
 *
 * NOTE(review): despite the method name, the setting exercised here is
 * downloadLimit, not a queue size.
 *
 * @covers VDB\Spider\Spider::crawl
 */
public function testCrawlBFSMaxQueueSize()
{
    $this->spider->getQueueManager()->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
    $this->spider->getDiscovererSet()->maxDepth = 1000;
    $this->spider->downloadLimit = 3;

    $this->spider->crawl();

    $expected = array(
        $this->linkA,
        $this->linkB,
        $this->linkC,
    );

    $this->assertEquals($expected, $this->statsHandler->getPersisted());
}
321
|
|
|
|
322
|
|
|
/**
 * Registers a stub on the request handler mock that throws on request(),
 * crawls, and expects the stats handler to report nothing filtered, nothing
 * persisted and exactly one failure.
 *
 * NOTE(review): setUp() has already registered a returnCallback stub for
 * request(); this test relies on the throwing stub taking effect anyway -
 * confirm against the mock framework version in use.
 *
 * @covers VDB\Spider\Spider::crawl
 */
public function testCrawlFailedRequest()
{
    $failure = new Exception('Failed mock request!');

    $this->requestHandler
        ->expects($this->any())
        ->method('request')
        ->will($this->throwException($failure));

    $this->spider->crawl();

    $this->assertCount(0, $this->statsHandler->getFiltered(), 'Filtered count');
    $this->assertCount(0, $this->statsHandler->getPersisted(), 'Persisted count');
    $this->assertCount(1, $this->statsHandler->getFailed(), 'Failed count');
}
341
|
|
|
} |
342
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.