1
|
|
|
<?php |
2
|
|
|
namespace CViniciusSDias\GoogleCrawler; |
3
|
|
|
|
4
|
|
|
use CViniciusSDias\GoogleCrawler\Exception\InvalidResultException; |
5
|
|
|
use CViniciusSDias\GoogleCrawler\Proxy\{ |
6
|
|
|
GoogleProxy, NoProxy |
7
|
|
|
}; |
8
|
|
|
use Psr\Http\Message\ResponseInterface; |
9
|
|
|
use Symfony\Component\DomCrawler\Crawler as DomCrawler; |
10
|
|
|
use Symfony\Component\DomCrawler\Link; |
11
|
|
|
use DOMElement; |
12
|
|
|
|
13
|
|
|
/** |
14
|
|
|
* Google Crawler |
15
|
|
|
* |
16
|
|
|
* @package CViniciusSDias\GoogleCrawler |
17
|
|
|
* @author Vinicius Dias |
18
|
|
|
*/ |
19
|
|
|
class Crawler
{
    /** @var string $url Search URL built from the search term in the constructor */
    protected $url;

    /**
     * Object used to fetch the search page and to parse each result URL.
     * A NoProxy instance is assigned when the caller provides no proxy.
     *
     * @var GoogleProxy|NoProxy $proxy
     */
    protected $proxy;

    /**
     * @param SearchTermInterface $searchTerm Term to search for; it is interpolated into the search URL
     * @param GoogleProxy|null $proxy Proxy used to perform the request; NoProxy is used when omitted
     */
    public function __construct(SearchTermInterface $searchTerm, GoogleProxy $proxy = null)
    {
        // You can concatenate &gl=XX replacing XX with your country code (BR = Brazil; US = United States)
        // You should also add the country specific part of the google url, (like .br or .es)
        $this->url = "http://www.google.com/search?q=$searchTerm&num=100";
        $this->proxy = $proxy ?? new NoProxy();
    }

    /**
     * Returns the 100 first found results for the specified search term
     *
     * @return ResultList
     * @throws \GuzzleHttp\Exception\ServerException If the proxy was overused
     * @throws \GuzzleHttp\Exception\ConnectException If the proxy is unavailable
     */
    public function getResults(): ResultList
    {
        /** @var ResponseInterface $response */
        $response = $this->proxy->getHttpResponse($this->url);
        $stringResponse = (string) $response->getBody();
        $domCrawler = new DomCrawler($stringResponse);

        // Only result containers that own a title heading with a link are considered.
        $googleResults = $domCrawler->filterXPath('//div[@class="g" and h3[@class="r" and a]]');
        $resultList = new ResultList($googleResults->count());

        foreach ($googleResults as $result) {
            $resultCrawler = new DomCrawler($result);
            $linkElement = $resultCrawler->filterXPath('//h3[@class="r"]/a')->getNode(0);
            $resultLink = new Link($linkElement, 'http://google.com/');

            // getNode(0) yields null when the result has no description span;
            // parseResult() accepts null and falls back to a default description.
            $descriptionElement = $resultCrawler->filterXPath('//span[@class="st"]')->getNode(0);

            try {
                $googleResult = $this->parseResult($resultLink, $descriptionElement);
                $resultList->addResult($googleResult);
            } catch (InvalidResultException $invalidResult) {
                // TODO Maybe log this exception. Other than that, there's nothing to do, cause it isn't an error.
            }
        }

        return $resultList;
    }

    /**
     * Assembles a Result from the link and the (optional) description element of a search result
     *
     * @param Link $resultLink
     * @param DOMElement|null $descriptionElement Null when the result has no description node
     * @return Result
     * @throws InvalidResultException
     */
    private function parseResult(Link $resultLink, DOMElement $descriptionElement = null): Result
    {
        // The null coalescing operator covers both a missing element and a null node value.
        $description = $descriptionElement->nodeValue
            ?? 'A description for this result isn\'t available due to the robots.txt file.';

        $googleResult = new Result();
        $googleResult
            ->setTitle($resultLink->getNode()->nodeValue)
            ->setUrl($this->getUrl($resultLink->getUri()))
            ->setDescription($description);

        return $googleResult;
    }

    /**
     * Parses the URL using the parser provided by $proxy
     *
     * @param string $url
     * @return string
     * @throws InvalidResultException
     */
    private function getUrl(string $url): string
    {
        return $this->proxy->parseUrl($url);
    }
}
100
|
|
|
|
Our type inference engine has found an assignment to a property that is incompatible with the declared type of that property.
Either this assignment is in error, or the assigned type should be added to the documentation/type hint for that property.