These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | namespace VDB\Spider; |
||
3 | |||
4 | use Exception; |
||
5 | use Guzzle\Http\Message\Response; |
||
6 | use PHPUnit_Framework_MockObject_MockObject; |
||
7 | use VDB\Spider\Discoverer\XPathExpressionDiscoverer; |
||
8 | use VDB\Spider\Tests\TestCase; |
||
9 | use VDB\Spider\QueueManager\InMemoryQueueManager; |
||
10 | use VDB\Spider\StatsHandler; |
||
11 | use VDB\Spider\Uri\DiscoveredUri; |
||
12 | use VDB\Uri\Uri; |
||
13 | |||
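14 | /** Crawl-order tests for Spider, run against a stubbed request handler serving seven fixture pages (A-G). |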
||
15 | */ |
||
16 | class SpiderTest extends TestCase |
||
17 | { |
||
/** @var Spider The crawler under test; re-created in setUp() before every test. */
protected $spider;

/** @var LogHandler Subscriber registered on the spider and queue-manager dispatchers. */
protected $logHandler;

/** @var StatsHandler Subscriber whose persisted/filtered/failed lists the tests assert on. */
protected $statsHandler;

/** @var PHPUnit_Framework_MockObject_MockObject Mock of VDB\Spider\RequestHandler\RequestHandlerInterface. */
protected $requestHandler;

/** @var DiscoveredUri */
protected $linkA;
/** @var DiscoveredUri */
protected $linkB;
/** @var DiscoveredUri */
protected $linkC;
/** @var DiscoveredUri */
protected $linkD;
/** @var DiscoveredUri */
protected $linkE;
/** @var DiscoveredUri */
protected $linkF;
/** @var DiscoveredUri */
protected $linkG;

/** @var Response */
protected $responseA;
/** @var Response */
protected $responseB;
/** @var Response */
protected $responseC;
/** @var Response */
protected $responseD;
/** @var Response */
protected $responseE;
/** @var Response */
protected $responseF;
/** @var Response */
protected $responseG;

/** @var string */
protected $hrefA;
/** @var string */
protected $hrefB;
/** @var string */
protected $hrefC;
/** @var string */
protected $hrefD;
/** @var string */
protected $hrefE;
/** @var string */
protected $hrefF;
/** @var string */
protected $hrefG;
||
76 | |||
/**
 * Builds a fresh spider, wired to a stubbed request handler, before each test.
 *
 * The stub serves seven HTML fixtures (A-G) that form the link graph below;
 * the number on the left is the depth at which each page is found:
 *
 *   0:        A
 *            /|\
 *   1:      B C E
 *          /| | |
 *   2:    D F G |
 *           |___|
 *
 * Note: E links to F.
 */
protected function setUp()
{
    $this->spider = new Spider('http://php-spider.org/A');

    $this->requestHandler = $this->getMock('VDB\Spider\RequestHandler\RequestHandlerInterface');

    // Page letter => depth at which that page is found in the graph above.
    $depthByPage = array('A' => 0, 'B' => 1, 'C' => 1, 'D' => 2, 'E' => 1, 'F' => 2, 'G' => 2);

    // Populate $hrefX / $linkX for every page.
    foreach ($depthByPage as $page => $depth) {
        $href = 'http://php-spider.org/' . $page;

        $link = new DiscoveredUri(new Uri($href));
        $link->setDepthFound($depth);

        $this->{'href' . $page} = $href;
        $this->{'link' . $page} = $link;
    }

    // Populate $responseX with the matching HTML fixture as its body.
    foreach (array_keys($depthByPage) as $page) {
        $html = file_get_contents(__DIR__ . '/Fixtures/SpiderTestHTMLResource' . $page . '.html');
        $this->{'response' . $page} = new Response(200, null, $html);
    }

    // Every request() call on the mock is answered by doTestRequest().
    $this->requestHandler
        ->expects($this->any())
        ->method('request')
        ->will($this->returnCallback(array($this, 'doTestRequest')));
    $this->spider->setRequestHandler($this->requestHandler);

    // Discover links through every anchor element.
    $this->spider->getDiscovererSet()->set(new XPathExpressionDiscoverer('//a'));

    // Both handlers subscribe to the spider's and the queue manager's dispatcher.
    $this->statsHandler = new StatsHandler();
    $this->spider->getDispatcher()->addSubscriber($this->statsHandler);
    $this->spider->getQueueManager()->getDispatcher()->addSubscriber($this->statsHandler);

    $this->logHandler = new LogHandler();
    $this->spider->getDispatcher()->addSubscriber($this->logHandler);
    $this->spider->getQueueManager()->getDispatcher()->addSubscriber($this->logHandler);
}
||
160 | |||
/**
 * Callback behind the mocked request handler: answers a request for one of
 * the seven stubbed URIs with the matching fixture response.
 *
 * The requested URI is read from the first call argument rather than from a
 * declared parameter.
 *
 * @return Resource whatever getResource() builds for the matched link and response
 * @throws \ErrorException when the requested URI is not one of the stubbed pages
 */
public function doTestRequest()
{
    $requested = func_get_arg(0);
    $requestedUri = $requested->toString();

    // Checked in the same A..G order, and with the same loose comparison, as a switch would.
    foreach (array('A', 'B', 'C', 'D', 'E', 'F', 'G') as $page) {
        $candidate = $this->{'link' . $page};

        if ($requestedUri == $candidate->toString()) {
            return $this->getResource($candidate, $this->{'response' . $page});
        }
    }

    throw new \ErrorException('The requested URI was not stubbed: ' . $requested->toString());
}
||
188 | |||
/**
 * Crawls the whole graph without selecting a traversal algorithm and checks
 * the order in which pages were persisted (the test name implies the default
 * is depth-first).
 *
 * Behaviour as explained here: https://en.wikipedia.org/wiki/Depth-first_search#Example
 *
 * @covers VDB\Spider\Spider::crawl
 */
public function testCrawlDFSDefaultBehaviour()
{
    $spider = $this->spider;
    $spider->getDiscovererSet()->maxDepth = 10;

    $spider->crawl();

    $this->assertEquals(
        array(
            $this->linkA,
            $this->linkE,
            $this->linkF,
            $this->linkC,
            $this->linkG,
            $this->linkB,
            $this->linkD
        ),
        $this->statsHandler->getPersisted()
    );
}
||
212 | |||
/**
 * Crawls the whole graph with the breadth-first algorithm and checks that
 * pages were persisted level by level: A, then B, C, E, then D, F, G.
 *
 * @covers VDB\Spider\Spider::crawl
 */
public function testCrawlBFSDefaultBehaviour()
{
    $spider = $this->spider;
    $spider->getQueueManager()->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
    $spider->getDiscovererSet()->maxDepth = 1000;

    $spider->crawl();

    $this->assertEquals(
        array(
            $this->linkA,
            $this->linkB,
            $this->linkC,
            $this->linkE,
            $this->linkD,
            $this->linkF,
            $this->linkG
        ),
        $this->statsHandler->getPersisted()
    );
}
||
236 | |||
/**
 * Crawls with maxDepth = 1 and no explicit traversal algorithm; only the
 * start page and the pages found at depth 1 are expected to be persisted.
 *
 * Behaviour as explained here: https://en.wikipedia.org/wiki/Depth-first_search#Example
 *
 * Given the following structure:
 *
 *   0:        A
 *            /|\
 *   1:      B C E
 *          /| | |
 *   2:    D F G |
 *           |___|
 *
 * We expect the following result: A, E, C, B
 *
 * @covers VDB\Spider\Spider::crawl
 */
public function testCrawlDFSMaxDepthOne()
{
    $spider = $this->spider;
    $spider->getDiscovererSet()->maxDepth = 1;

    $spider->crawl();

    $this->assertEquals(
        array(
            $this->linkA,
            $this->linkE,
            $this->linkC,
            $this->linkB,
        ),
        $this->statsHandler->getPersisted()
    );
}
||
269 | |||
/**
 * Crawls breadth-first with maxDepth = 1; only the start page and the pages
 * found at depth 1 are expected to be persisted, in the order A, B, C, E.
 *
 * @covers VDB\Spider\Spider::crawl
 */
public function testCrawlBFSMaxDepthOne()
{
    $this->spider->getQueueManager()->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
    $this->spider->getDiscovererSet()->maxDepth = 1;

    $this->spider->crawl();

    $expected = array(
        $this->linkA,
        $this->linkB,
        $this->linkC,
        $this->linkE,
    );

    $this->assertEquals($expected, $this->statsHandler->getPersisted());
}
||
286 | |||
/**
 * Crawls with downloadLimit = 3 and no explicit traversal algorithm; only
 * the first three pages of that traversal (A, E, F) are expected to be persisted.
 *
 * @covers VDB\Spider\Spider::crawl
 */
public function testCrawlDFSMaxQueueSize()
{
    $spider = $this->spider;
    $spider->getDiscovererSet()->maxDepth = 1000;
    $spider->downloadLimit = 3;

    $spider->crawl();

    $this->assertEquals(
        array(
            $this->linkA,
            $this->linkE,
            $this->linkF,
        ),
        $this->statsHandler->getPersisted()
    );
}
||
305 | |||
/**
 * Crawls breadth-first with downloadLimit = 3; only the first three pages of
 * that traversal (A, B, C) are expected to be persisted.
 *
 * @covers VDB\Spider\Spider::crawl
 */
public function testCrawlBFSMaxQueueSize()
{
    $this->spider->getQueueManager()->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
    $this->spider->getDiscovererSet()->maxDepth = 1000;
    $this->spider->downloadLimit = 3;

    $this->spider->crawl();

    $expected = array(
        $this->linkA,
        $this->linkB,
        $this->linkC,
    );

    $this->assertEquals($expected, $this->statsHandler->getPersisted());
}
||
322 | |||
/**
 * Makes the request handler throw and checks the resulting statistics:
 * nothing filtered, nothing persisted, exactly one failure.
 *
 * NOTE(review): setUp() has already stubbed request() with a callback, so
 * this test relies on the exception stub registered here still taking
 * effect; confirm against the PHPUnit version in use.
 *
 * @covers VDB\Spider\Spider::crawl
 */
public function testCrawlFailedRequest()
{
    $failure = new Exception('Failed mock request!');

    $this->requestHandler
        ->expects($this->any())
        ->method('request')
        ->will($this->throwException($failure));

    $this->spider->crawl();

    $this->assertCount(0, $this->statsHandler->getFiltered(), 'Filtered count');
    $this->assertCount(0, $this->statsHandler->getPersisted(), 'Persisted count');
    $this->assertCount(1, $this->statsHandler->getFailed(), 'Failed count');
}
||
342 | } |
||
343 |
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.