Completed
Push — master ( 527364...a780c1 )
by Vinicius
05:34
created

Crawler   A

Complexity

Total Complexity 17

Size/Duplication

Total Lines 144
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 7

Test Coverage

Coverage 37.29%

Importance

Changes 0
Metric Value
wmc 17
lcom 1
cbo 7
dl 0
loc 144
ccs 22
cts 59
cp 0.3729
rs 10
c 0
b 0
f 0

7 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 16 4
A getResults() 0 28 4
A createResult() 0 13 1
A parseUrl() 0 4 1
A getGoogleUrl() 0 10 2
A isImageSuggestion() 0 8 1
A parseDomElement() 0 22 4
1
<?php
2
namespace CViniciusSDias\GoogleCrawler;
3
4
use CViniciusSDias\GoogleCrawler\Exception\InvalidGoogleHtmlException;
5
use CViniciusSDias\GoogleCrawler\Exception\InvalidResultException;
6
use CViniciusSDias\GoogleCrawler\Proxy\{
7
    GoogleProxyInterface, NoProxy
8
};
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
use Symfony\Component\DomCrawler\Link;
11
use DOMElement;
12
13
/**
 * Google Crawler
 *
 * Requests a Google search page through a proxy implementation and turns the
 * result elements found in the returned HTML into a ResultList.
 *
 * @package CViniciusSDias\GoogleCrawler
 * @author Vinicius Dias
 */
class Crawler
{
    /** @var GoogleProxyInterface $proxy */
    protected $proxy;
    /** @var SearchTermInterface $searchTerm */
    private $searchTerm;
    /** @var string $googleDomain */
    private $googleDomain;
    /** @var string $countryCode */
    private $countryCode;

    /**
     * @param SearchTermInterface $searchTerm Term to be searched; it is interpolated into the search URL
     * @param GoogleProxyInterface|null $proxy Proxy used to perform the request; NoProxy is used when null
     * @param string $googleDomain Domain to query, without scheme (e.g. "google.com")
     * @param string $countryCode Optional country code sent as the "gl" parameter; stored upper-cased
     * @throws \InvalidArgumentException If $googleDomain does not contain "google." or starts with "http"
     */
    public function __construct(
        SearchTermInterface $searchTerm,
        GoogleProxyInterface $proxy = null,
        string $googleDomain = 'google.com',
        string $countryCode = ''
    ) {
        $this->proxy = $proxy ?? new NoProxy();
        $this->searchTerm = $searchTerm;

        // The domain must mention "google." and must be a bare host: the scheme is added by getGoogleUrl()
        $mentionsGoogle = mb_stripos($googleDomain, 'google.') !== false;
        $startsWithScheme = mb_stripos($googleDomain, 'http') === 0;
        if (!$mentionsGoogle || $startsWithScheme) {
            throw new \InvalidArgumentException('Invalid google domain');
        }
        $this->googleDomain = $googleDomain;

        $this->countryCode = mb_strtoupper($countryCode);
    }

    /**
     * Returns the results found for the specified search term (the request asks Google for 100 results)
     *
     * Result elements that cannot be parsed are logged and skipped instead of aborting the whole search.
     *
     * @return ResultList
     * @throws InvalidGoogleHtmlException If the returned HTML contains no parseable result element
     * @throws \GuzzleHttp\Exception\ServerException If the proxy was overused
     * @throws \GuzzleHttp\Exception\ConnectException If the proxy is unavailable or $googleDomain is invalid
     */
    public function getResults(): ResultList
    {
        $response = $this->proxy->getHttpResponse($this->getGoogleUrl());
        $domCrawler = new DomCrawler((string) $response->getBody());

        // A result is a div.g that directly contains an h3.r holding a link
        $googleResultList = $domCrawler->filterXPath('//div[@class="g" and h3[@class="r" and a]]');
        $resultCount = $googleResultList->count();
        if ($resultCount === 0) {
            throw new InvalidGoogleHtmlException('No parseable element found');
        }

        $resultList = new ResultList($resultCount);

        foreach ($googleResultList as $googleResultElement) {
            try {
                $resultList->addResult($this->parseDomElement($googleResultElement));
            } catch (InvalidResultException $exception) {
                // error_log() with message type 3 appends the message verbatim, so the line break
                // must be added here; the exception message records why the element was rejected.
                error_log(
                    'Error parsing the following result (' . $exception->getMessage() . '): '
                    . print_r($googleResultElement, true) . PHP_EOL,
                    3,
                    __DIR__ . '/../var/log'
                );
            }
        }

        return $resultList;
    }

    /**
     * Assembles a Result from the link and description elements of a single Google result
     *
     * @param Link $resultLink Link whose node text becomes the title and whose URI is parsed by the proxy
     * @param DOMElement $descriptionElement Element whose text becomes the description
     * @return Result
     * @throws InvalidResultException
     */
    private function createResult(Link $resultLink, DOMElement $descriptionElement): Result
    {
        $description = $descriptionElement->nodeValue
            ?? 'A description for this result isn\'t available due to the robots.txt file.';

        $googleResult = new Result();
        $googleResult
            ->setTitle($resultLink->getNode()->nodeValue)
            ->setUrl($this->parseUrl($resultLink->getUri()))
            ->setDescription($description);

        return $googleResult;
    }

    /**
     * Parses the URL using the parser provided by $proxy
     *
     * @param string $url
     * @return string
     * @throws InvalidResultException
     */
    private function parseUrl(string $url): string
    {
        return $this->proxy->parseUrl($url);
    }

    /**
     * Assembles the Google URL using the previously informed data
     *
     * The "gl" parameter is only appended when a country code was informed.
     *
     * @return string
     */
    private function getGoogleUrl(): string
    {
        $url = "https://{$this->googleDomain}/search?q={$this->searchTerm}&num=100";
        if (!empty($this->countryCode)) {
            $url .= "&gl={$this->countryCode}";
        }

        return $url;
    }

    /**
     * Tells whether the result is treated as an image suggestion, i.e. it contains a link
     * that is a direct child of a div
     *
     * @param DomCrawler $resultCrawler Crawler wrapping a single result element
     * @return bool
     */
    private function isImageSuggestion(DomCrawler $resultCrawler): bool
    {
        return $resultCrawler->filterXPath('//div/a')->count() > 0;
    }

    /**
     * Converts a single result element into a Result
     *
     * @param DOMElement $result Result element selected by getResults()
     * @return Result
     * @throws InvalidResultException If the link or the description is missing, or if the element
     *                                is an image suggestion
     */
    private function parseDomElement(DOMElement $result): Result
    {
        $resultCrawler = new DomCrawler($result);

        $linkElement = $resultCrawler->filterXPath('//h3[@class="r"]/a')->getNode(0);
        if (is_null($linkElement)) {
            throw new InvalidResultException('Link element not found');
        }

        // The base URI is what relative hrefs are resolved against
        $resultLink = new Link($linkElement, 'http://google.com/');

        $descriptionElement = $resultCrawler->filterXPath('//span[@class="st"]')->getNode(0);
        if (is_null($descriptionElement)) {
            throw new InvalidResultException('Description element not found');
        }

        if ($this->isImageSuggestion($resultCrawler)) {
            throw new InvalidResultException('Result is an image suggestion');
        }

        return $this->createResult($resultLink, $descriptionElement);
    }
}
163