1
|
|
|
<?php |
2
|
|
|
namespace CViniciusSDias\GoogleCrawler; |
3
|
|
|
|
4
|
|
|
use CViniciusSDias\GoogleCrawler\Exception\InvalidResultException; |
5
|
|
|
use CViniciusSDias\GoogleCrawler\Proxy\{ |
6
|
|
|
GoogleProxy, NoProxy |
7
|
|
|
}; |
8
|
|
|
use Psr\Http\Message\ResponseInterface; |
9
|
|
|
use Symfony\Component\DomCrawler\Crawler as DomCrawler; |
10
|
|
|
use Symfony\Component\DomCrawler\Link; |
11
|
|
|
|
12
|
|
|
/**
 * Google Crawler
 *
 * Requests a Google search results page through a proxy object and converts
 * the result links found in the returned HTML into a ResultList.
 *
 * @package CViniciusSDias\GoogleCrawler
 * @author Vinicius Dias
 */
class Crawler
{
    /** @var string $url Search URL requested by getResults() */
    protected $url;

    /**
     * Object that performs the HTTP request and parses each result URL.
     *
     * NoProxy is listed explicitly because the constructor falls back to a
     * NoProxy instance when no proxy is supplied; documenting only GoogleProxy
     * makes static analysers report an incompatible assignment.
     *
     * @var GoogleProxy|NoProxy $proxy
     */
    protected $proxy;

    /**
     * Builds the search URL for the given term and stores the proxy to use.
     *
     * @param SearchTermInterface $searchTerm Term to search for; it is interpolated into the query string as-is.
     *                                        NOTE(review): presumably already URL-encoded by the implementation - confirm.
     * @param GoogleProxy|null    $proxy      Proxy used for the request; when null, a NoProxy instance is used.
     */
    public function __construct(SearchTermInterface $searchTerm, GoogleProxy $proxy = null)
    {
        // You can concatenate &gl=XX replacing XX with your country code (BR = Brazil; US = United States)
        // You should also add the country specific part of the google url, (like .br or .es)
        $this->url = "http://www.google.com/search?q=$searchTerm&num=100";
        $this->proxy = $proxy ?? new NoProxy();
    }

    /**
     * Returns the 100 first found results for the specified search term
     *
     * Every anchor matching "h3.r > a" in the response body is treated as a
     * result. Anchors whose parsing throws InvalidResultException are skipped,
     * so the returned list may hold fewer entries than anchors found.
     *
     * @return ResultList
     */
    public function getResults(): ResultList
    {
        /** @var ResponseInterface $response */
        $response = $this->proxy->getHttpResponse($this->url);
        $stringResponse = (string) $response->getBody();
        $domCrawler = new DomCrawler($stringResponse);
        $googleResults = $domCrawler->filter('h3.r > a');
        $resultList = new ResultList($googleResults->count());

        foreach ($googleResults as $result) {
            // The base URI lets Link resolve relative hrefs into absolute URIs.
            $resultLink = new Link($result, 'http://google.com/');
            try {
                $googleResult = $this->parseResult($resultLink);
                $resultList->addResult($googleResult);
            } catch (InvalidResultException $invalidResult) {
                // TODO Maybe log this exception. Other than that, there's nothing to do, cause it isn't an error.
            }
        }

        return $resultList;
    }

    /**
     * Assembles a Result from $resultLink, using the node text as the title
     * and the parsed link URI as the URL.
     *
     * @param Link $resultLink
     * @return Result
     * @throws InvalidResultException Presumably raised while parsing the URL when the link is not a
     *                                valid result (getResults() catches it) - confirm against the proxy.
     */
    private function parseResult(Link $resultLink): Result
    {
        $googleResult = new Result();
        $googleResult
            ->setTitle($resultLink->getNode()->nodeValue)
            ->setUrl($this->getUrl($resultLink->getUri()));

        return $googleResult;
    }

    /**
     * Parses the URL using the parser provided by $proxy
     *
     * @param string $url
     * @return string
     */
    private function getUrl(string $url): string
    {
        return $this->proxy->parseUrl($url);
    }
}
87
|
|
|
|
Our type inference engine has found an assignment to a property that is incompatible with the declared type of that property.
Either this assignment is in error, or the assigned type should be added to the documentation/type hint for that property.