1
|
|
|
<?php |
2
|
|
|
namespace VDB\Spider; |
3
|
|
|
|
4
|
|
|
use Exception; |
5
|
|
|
use GuzzleHttp\Psr7\Response; |
6
|
|
|
use PHPUnit_Framework_MockObject_MockObject; |
7
|
|
|
use VDB\Spider\Discoverer\XPathExpressionDiscoverer; |
8
|
|
|
use VDB\Spider\Tests\TestCase; |
9
|
|
|
use VDB\Spider\QueueManager\InMemoryQueueManager; |
10
|
|
|
use VDB\Spider\Uri\DiscoveredUri; |
11
|
|
|
use VDB\Uri\Uri; |
12
|
|
|
|
13
|
|
|
/**
 * Tests for Spider, crawling a small link graph that is served entirely
 * from HTML fixture files through a mocked request handler.
 */
class SpiderTest extends TestCase
{
    /**
     * @var Spider
     */
    protected $spider;

    /**
     * @var PHPUnit_Framework_MockObject_MockObject
     */
    protected $requestHandler;

    /** @var DiscoveredUri */
    protected $linkA;
    /** @var DiscoveredUri */
    protected $linkB;
    /** @var DiscoveredUri */
    protected $linkC;
    /** @var DiscoveredUri */
    protected $linkD;
    /** @var DiscoveredUri */
    protected $linkE;
    /** @var DiscoveredUri */
    protected $linkF;
    /** @var DiscoveredUri */
    protected $linkG;

    /** @var Response */
    protected $responseA;
    /** @var Response */
    protected $responseB;
    /** @var Response */
    protected $responseC;
    /** @var Response */
    protected $responseD;
    /** @var Response */
    protected $responseE;
    /** @var Response */
    protected $responseF;
    /** @var Response */
    protected $responseG;

    /** @var string */
    protected $hrefA;
    /** @var string */
    protected $hrefB;
    /** @var string */
    protected $hrefC;
    /** @var string */
    protected $hrefD;
    /** @var string */
    protected $hrefE;
    /** @var string */
    protected $hrefF;
    /** @var string */
    protected $hrefG;

    /**
     * @var array An associative array, containing a map of $this->linkX to $this->responseX.
     */
    protected $linkToResponseMap = [];

    /**
     * Builds the spider under test and stubs every request it may issue.
     * This method is called before a test is executed.
     *
     * Setting up the following structure:
     *
     * 0:        A
     *          /|\
     * 1:      B C E
     *        /| | |
     * 2:    D F G |
     *         | _ |
     *
     * Note: E links to F.
     */
    protected function setUp()
    {
        $this->spider = new Spider('http://php-spider.org/A');

        $this->requestHandler = $this
            ->getMockBuilder('VDB\Spider\RequestHandler\RequestHandlerInterface')
            ->getMock();

        // Resource name => depth assigned to its link via setDepthFound().
        $depthByName = [
            'A' => 0,
            'B' => 1,
            'C' => 1,
            'D' => 2,
            'E' => 1,
            'F' => 2,
            'G' => 2,
        ];

        foreach ($depthByName as $name => $depth) {
            $href = 'http://php-spider.org/' . $name;

            $link = new DiscoveredUri(new Uri($href));
            $link->setDepthFound($depth);

            $html = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResource' . $name . '.html');
            $response = new Response(200, [], $html);

            // Fill the declared hrefX / linkX / responseX properties.
            $this->{'href' . $name} = $href;
            $this->{'link' . $name} = $link;
            $this->{'response' . $name} = $response;

            $this->linkToResponseMap[$link->toString()] = $response;
        }

        // Every request is answered from the fixture map by doTestRequest().
        $this->requestHandler
            ->expects($this->any())
            ->method('request')
            ->will($this->returnCallback([$this, 'doTestRequest']));

        $this->spider->getDownloader()->setRequestHandler($this->requestHandler);

        $this->spider->getDiscovererSet()->set(new XPathExpressionDiscoverer('//a'));
    }

    /**
     * Callback for the mocked request handler: looks up the stubbed response
     * for the requested link and wraps it via getResource().
     *
     * @param DiscoveredUri $link The link passed to the mocked request() call.
     *
     * @return Resource
     * @throws \ErrorException When no response was stubbed for the link.
     */
    public function doTestRequest($link)
    {
        $key = $link->toString();

        if (!array_key_exists($key, $this->linkToResponseMap)) {
            throw new \ErrorException('The requested URI was not stubbed: ' . $key);
        }

        return $this->getResource($link, $this->linkToResponseMap[$key]);
    }

    /**
     * @covers VDB\Spider\Spider
     *
     * Behaviour as explained here: https://en.wikipedia.org/wiki/Depth-first_search#Example
     */
    public function testCrawlDFSDefaultBehaviour()
    {
        $this->spider->getDiscovererSet()->maxDepth = 10;

        $this->assertCrawlOrder([
            $this->linkA,
            $this->linkE,
            $this->linkF,
            $this->linkC,
            $this->linkG,
            $this->linkB,
            $this->linkD,
        ]);
    }

    /**
     * @covers VDB\Spider\Spider
     */
    public function testCrawlBFSDefaultBehaviour()
    {
        $this->spider->getQueueManager()->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
        $this->spider->getDiscovererSet()->maxDepth = 1000;

        $this->assertCrawlOrder([
            $this->linkA,
            $this->linkB,
            $this->linkC,
            $this->linkE,
            $this->linkD,
            $this->linkF,
            $this->linkG,
        ]);
    }

    /**
     * Crawls with the spider as currently configured and asserts that the
     * persisted resources match the expected links, in order.
     *
     * @param DiscoveredUri[] $expected Links in the order they should have been persisted.
     */
    private function assertCrawlOrder(array $expected)
    {
        $this->spider->crawl();

        $this->compareUriArray($expected, $this->spider->getDownloader()->getPersistenceHandler());
    }

    /**
     * Asserts that $actual holds exactly the expected URIs, in the same order.
     *
     * The count is checked first: iterating only over $actual would otherwise
     * let a crawl that persisted too few (or no) resources pass unnoticed.
     *
     * @param DiscoveredUri[] $expected Expected URIs, indexed from 0.
     * @param mixed $actual Countable, iterable collection whose items expose getUri().
     */
    private function compareUriArray($expected, $actual)
    {
        $this->assertCount(count($expected), $actual, 'Number of persisted resources');

        foreach ($actual as $index => $resource) {
            $this->assertEquals($expected[$index], $resource->getUri());
        }
    }

    /**
     * @covers VDB\Spider\Spider
     *
     * Behaviour as explained here: https://en.wikipedia.org/wiki/Depth-first_search#Example
     *
     * Given the following structure:
     *
     * 0:        A
     *          /|\
     * 1:      B C E
     *        /| | |
     * 2:    D F G |
     *         | _ |
     *
     * We expect the following result: A, E, C, B
     *
     */
    public function testCrawlDFSMaxDepthOne()
    {
        $this->spider->getDiscovererSet()->maxDepth = 1;

        $this->assertCrawlOrder([
            $this->linkA,
            $this->linkE,
            $this->linkC,
            $this->linkB,
        ]);
    }

    /**
     * @covers VDB\Spider\Spider
     */
    public function testCrawlBFSMaxDepthOne()
    {
        $this->spider->getQueueManager()->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
        $this->spider->getDiscovererSet()->maxDepth = 1;

        $this->assertCrawlOrder([
            $this->linkA,
            $this->linkB,
            $this->linkC,
            $this->linkE,
        ]);
    }

    /**
     * Depth-first crawl capped by the downloader's download limit
     * (setDownloadLimit(3)); the first three links are expected.
     *
     * @covers VDB\Spider\Spider
     */
    public function testCrawlDFSMaxQueueSize()
    {
        $this->spider->getDiscovererSet()->maxDepth = 1000;
        $this->spider->getDownloader()->setDownloadLimit(3);

        $this->assertCrawlOrder([
            $this->linkA,
            $this->linkE,
            $this->linkF,
        ]);
    }

    /**
     * Breadth-first crawl capped by the downloader's download limit
     * (setDownloadLimit(3)); the first three links are expected.
     *
     * @covers VDB\Spider\Spider
     */
    public function testCrawlBFSMaxQueueSize()
    {
        $this->spider->getQueueManager()->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
        $this->spider->getDiscovererSet()->maxDepth = 1000;
        $this->spider->getDownloader()->setDownloadLimit(3);

        $this->assertCrawlOrder([
            $this->linkA,
            $this->linkB,
            $this->linkC,
        ]);
    }

    /**
     * When every request throws, nothing is expected to be persisted.
     *
     * @covers VDB\Spider\Spider
     */
    public function testCrawlFailedRequest()
    {
        // A dedicated handler is installed instead of adding a second
        // expectation to the one stubbed in setUp(), so the outcome does not
        // depend on how PHPUnit combines several matchers for one method.
        $failingHandler = $this
            ->getMockBuilder('VDB\Spider\RequestHandler\RequestHandlerInterface')
            ->getMock();

        $failingHandler
            ->expects($this->any())
            ->method('request')
            ->will($this->throwException(new Exception('Failed mock request!')));

        $this->spider->getDownloader()->setRequestHandler($failingHandler);

        $this->spider->crawl();

        $this->assertCount(0, $this->spider->getDownloader()->getPersistenceHandler(), 'Persisted count');
    }

    /**
     * @covers VDB\Spider\Spider
     * @covers VDB\Spider\Downloader\Downloader::getDispatcher
     */
    public function testDownloaderEventDispatcher()
    {
        $this->assertSame(
            $this->spider->getDispatcher(),
            $this->spider->getDownloader()->getDispatcher(),
            'Default Spider dispatcher is the same as default Downloader dispatcher'
        );
    }

    /**
     * @covers VDB\Spider\Spider
     * @covers VDB\Spider\QueueManager\InMemoryQueueManager::getDispatcher
     */
    public function testQueueManagerEventDispatcher()
    {
        $this->assertSame(
            $this->spider->getDispatcher(),
            $this->spider->getQueueManager()->getDispatcher(),
            'Default Spider dispatcher is the same as default Queue manager dispatcher'
        );
    }
}
359
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.