Crawler::isImageSuggestion()   A
last analyzed

Complexity

Conditions 1
Paths 1

Size

Total Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
dl 0
loc 8
ccs 0
cts 4
cp 0
rs 10
c 0
b 0
f 0
cc 1
nc 1
nop 1
crap 2
1
<?php
2
namespace CViniciusSDias\GoogleCrawler;
3
4
use CViniciusSDias\GoogleCrawler\Exception\InvalidGoogleHtmlException;
5
use CViniciusSDias\GoogleCrawler\Exception\InvalidResultException;
6
use CViniciusSDias\GoogleCrawler\Proxy\{
7
    GoogleProxyInterface, NoProxy
8
};
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
use Symfony\Component\DomCrawler\Link;
11
use DOMElement;
12
13
/**
14
 * Google Crawler
15
 *
16
 * @package CViniciusSDias\GoogleCrawler
17
 * @author Vinicius Dias
18
 */
19
class Crawler
20
{
21
    /** @var GoogleProxyInterface $proxy */
22
    protected $proxy;
23
    /** @var SearchTermInterface $searchTerm */
24
    private $searchTerm;
25
    /** @var string $googleDomain */
26
    private $googleDomain;
27
    /** @var string $countryCode */
28
    private $countryCode;
29
30 8
    /**
     * Stores the search configuration after validating the Google domain.
     *
     * @param SearchTermInterface $searchTerm Term that will be searched
     * @param GoogleProxyInterface|null $proxy Proxy used for the HTTP request; a NoProxy instance is used when omitted
     * @param string $googleDomain Domain to query, without scheme (must contain "google.")
     * @param string $countryCode Optional country code; stored upper-cased
     * @throws \InvalidArgumentException If $googleDomain does not contain "google." or starts with "http"
     */
    public function __construct(
        SearchTermInterface $searchTerm,
        GoogleProxyInterface $proxy = null,
        string $googleDomain = 'google.com',
        string $countryCode = ''
    ) {
        $this->proxy = $proxy ?? new NoProxy();
        $this->searchTerm = $searchTerm;

        // Both checks are case-insensitive: the domain must mention "google."
        // and must be given without a scheme prefix.
        $mentionsGoogle = stripos($googleDomain, 'google.') !== false;
        $startsWithScheme = stripos($googleDomain, 'http') === 0;
        if (!$mentionsGoogle || $startsWithScheme) {
            throw new \InvalidArgumentException('Invalid google domain');
        }

        $this->googleDomain = $googleDomain;
        $this->countryCode = strtoupper($countryCode);
    }
46
47
    /**
     * Returns the 100 first found results for the specified search term
     *
     * Result elements that cannot be parsed are skipped; each one is appended,
     * together with the reason, as one line to var/log/crawler-errors.log.
     *
     * @return ResultList
     * @throws InvalidGoogleHtmlException If the response contains no element matching the expected result markup
     * @throws \GuzzleHttp\Exception\ServerException If the proxy was overused
     * @throws \GuzzleHttp\Exception\ConnectException If the proxy is unavailable or the google domain is invalid
     */
    public function getResults(): ResultList
    {
        $googleUrl = $this->getGoogleUrl();
        $response = $this->proxy->getHttpResponse($googleUrl);
        $stringResponse = (string) $response->getBody();
        $domCrawler = new DomCrawler($stringResponse);
        $googleResultList = $domCrawler->filterXPath('//div[@class="ZINbbc xpd O9g5cc uUPGi"]');

        $resultCount = $googleResultList->count();
        if ($resultCount === 0) {
            throw new InvalidGoogleHtmlException('No parseable element found');
        }

        $resultList = new ResultList($resultCount);

        foreach ($googleResultList as $googleResultElement) {
            try {
                $parsedResult = $this->parseDomElement($googleResultElement);
                $resultList->addResult($parsedResult);
            } catch (InvalidResultException $exception) {
                // error_log() with message type 3 appends the message verbatim,
                // so the line terminator has to be added explicitly; otherwise
                // consecutive entries run together in the log file.
                error_log(
                    'Error parsing the following result: ' . print_r($googleResultElement, true)
                        . ' (' . $exception->getMessage() . ')' . PHP_EOL,
                    3,
                    __DIR__ . '/../var/log/crawler-errors.log'
                );
            }
        }

        return $resultList;
    }
82
83
    /**
     * Builds a Result from a result link and its description element.
     *
     * The title is the text of the link node, the URL is the link URI after
     * being handed to parseUrl(), and the description is the text of
     * $descriptionElement (a fixed notice is used when that text is null).
     *
     * @param Link $resultLink
     * @param DOMElement $descriptionElement
     * @return Result
     * @throws InvalidResultException
     */
    private function createResult(Link $resultLink, DOMElement $descriptionElement): Result
    {
        $fallbackDescription = 'A description for this result isn\'t available due to the robots.txt file.';
        $description = $descriptionElement->nodeValue ?? $fallbackDescription;

        $result = new Result();
        $title = $resultLink->getNode()->nodeValue;
        $result->setTitle($title)->setUrl($this->parseUrl($resultLink->getUri()))->setDescription($description);

        return $result;
    }
104
105
    /**
     * Delegates URL parsing to the configured proxy.
     *
     * @param string $url URL as found in the result link
     * @return string URL returned by the proxy's parser
     * @throws InvalidResultException
     */
    private function parseUrl(string $url): string
    {
        $parsedUrl = $this->proxy->parseUrl($url);

        return $parsedUrl;
    }
116
117
    /**
     * Builds the search URL from the configured domain, search term and country code.
     *
     * The "gl" parameter is only appended when a country code was informed.
     *
     * @return string
     */
    private function getGoogleUrl(): string
    {
        $baseUrl = "https://{$this->googleDomain}/search?q={$this->searchTerm}&num=100";

        return empty($this->countryCode)
            ? $baseUrl
            : "{$baseUrl}&gl={$this->countryCode}";
    }
130
131
    /**
     * Tells whether the given result contains at least one <img> element.
     *
     * @param DomCrawler $resultCrawler Crawler wrapping a single result node
     * @return bool True when an image is found inside the result
     */
    private function isImageSuggestion(DomCrawler $resultCrawler): bool
    {
        return $resultCrawler->filterXPath('//img')->count() > 0;
    }
139
140
    /**
     * Converts a single result node of the response into a Result.
     *
     * The node is rejected when it has no link, no description element,
     * contains an image, or when its link URI (resolved against
     * "http://google.com/") does not contain "http://google.com".
     *
     * @param DOMElement $result Node matched as one search result
     * @return Result
     * @throws InvalidResultException If the node is not a regular search result
     */
    private function parseDomElement(DOMElement $result): Result
    {
        $resultCrawler = new DomCrawler($result);

        $anchor = $resultCrawler->filterXPath('//a')->getNode(0);
        if ($anchor === null) {
            throw new InvalidResultException('Link element not found');
        }
        $resultLink = new Link($anchor, 'http://google.com/');

        $descriptionXPath = '//div[@class="BNeawe s3v9rd AP7Wnd"]//div[@class="BNeawe s3v9rd AP7Wnd"]';
        $descriptionNode = $resultCrawler->filterXPath($descriptionXPath)->getNode(0);
        if ($descriptionNode === null) {
            throw new InvalidResultException('Description element not found');
        }

        if ($this->isImageSuggestion($resultCrawler)) {
            throw new InvalidResultException('Result is an image suggestion');
        }

        $containsGoogleBase = strpos($resultLink->getUri(), 'http://google.com') !== false;
        if (!$containsGoogleBase) {
            throw new InvalidResultException('Result is a google suggestion');
        }

        return $this->createResult($resultLink, $descriptionNode);
    }
166
}
167