Completed
Push — master ( 54b7d7...527364 )
by Vinicius
11:13
created

Crawler::parseDomElement()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 18

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 12

Importance

Changes 0
Metric Value
dl 0
loc 18
ccs 0
cts 0
cp 0
rs 9.6666
c 0
b 0
f 0
cc 3
nc 3
nop 1
crap 12
1
<?php
2
namespace CViniciusSDias\GoogleCrawler;
3
4
use CViniciusSDias\GoogleCrawler\Exception\InvalidGoogleHtmlException;
5
use CViniciusSDias\GoogleCrawler\Exception\InvalidResultException;
6
use CViniciusSDias\GoogleCrawler\Proxy\{
7
    GoogleProxyInterface, NoProxy
8
};
9
use Psr\Http\Message\ResponseInterface;
10
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
11
use Symfony\Component\DomCrawler\Link;
12
use DOMElement;
13
14
/**
15
 * Google Crawler
16
 *
17
 * @package CViniciusSDias\GoogleCrawler
18
 * @author Vinicius Dias
19
 */
20
class Crawler
21
{
22
    /** @var GoogleProxyInterface $proxy Used to fetch the search page and to parse each result URL */
23
    protected $proxy;
24
    /** @var SearchTermInterface $searchTerm Term placed in the "q" parameter of the search URL */
25
    private $searchTerm;
26
    /** @var string $googleDomain Host of the search URL; the constructor requires it to contain "google." */
27
    private $googleDomain;
28
    /** @var string $countryCode Upper-cased country code; sent as the "gl" parameter when not empty */
29
    private $countryCode;
30 16
31
    /**
     * Stores the search configuration, falling back to NoProxy when no proxy is given.
     *
     * @param SearchTermInterface $searchTerm Term to search for
     * @param GoogleProxyInterface|null $proxy Proxy used for the request; NoProxy is used when null
     * @param string $googleDomain Google host; must contain "google." and must not start with "http"
     * @param string $countryCode Country code; stored upper-cased
     * @throws \InvalidArgumentException If $googleDomain does not satisfy the rule above
     */
    public function __construct(
        SearchTermInterface $searchTerm,
        GoogleProxyInterface $proxy = null,
        string $googleDomain = 'google.com',
        string $countryCode = ''
    ) {
        $this->proxy = $proxy ?? new NoProxy();
        $this->searchTerm = $searchTerm;

        // A bare host is expected here: it has to mention "google." and must not begin with "http".
        $mentionsGoogle = mb_stripos($googleDomain, 'google.') !== false;
        $startsWithHttp = mb_stripos($googleDomain, 'http') === 0;
        if (!$mentionsGoogle || $startsWithHttp) {
            throw new \InvalidArgumentException('Invalid google domain');
        }

        $this->googleDomain = $googleDomain;
        $this->countryCode = mb_strtoupper($countryCode);
    }
47
48
    /**
     * Returns the results found for the configured search term.
     *
     * The search page is fetched through the proxy and every element matching the result XPath is
     * parsed. Elements that raise InvalidResultException are written to the log file and skipped.
     *
     * @return ResultList
     * @throws InvalidGoogleHtmlException If the response contains no parseable result element
     * @throws \GuzzleHttp\Exception\ServerException If the proxy was overused
     * @throws \GuzzleHttp\Exception\ConnectException If the proxy is unavailable or the Google domain is invalid
     */
    public function getResults(): ResultList
    {
        $googleUrl = $this->getGoogleUrl();
        /** @var ResponseInterface $response */
        $response = $this->proxy->getHttpResponse($googleUrl);
        $stringResponse = (string) $response->getBody();
        $domCrawler = new DomCrawler($stringResponse);
        $googleResultList = $domCrawler->filterXPath('//div[@class="g" and h3[@class="r" and a]]');
        if ($googleResultList->count() === 0) {
            throw new InvalidGoogleHtmlException('No parseable element found');
        }

        $resultList = new ResultList($googleResultList->count());

        foreach ($googleResultList as $googleResultElement) {
            try {
                $parsedResult = $this->parseDomElement($googleResultElement);
                $resultList->addResult($parsedResult);
            } catch (InvalidResultException $exception) {
                // print_r() needs its second argument set to true to RETURN the dump; without it the
                // dump is echoed to the output and the boolean return value ("1") ends up in the log.
                error_log(
                    'Error parsing the following result: ' . print_r($googleResultElement, true),
                    3,
                    __DIR__ . '/../var/log'
                );
            }
        }

        return $resultList;
    }
84
85
    /**
     * Builds a Result from a result link and its description element.
     *
     * The title is the text of the link node, the URL is the link URI after being parsed by
     * parseUrl(), and the description is the text of $descriptionElement (or a fixed fallback
     * message when that text is null).
     *
     * @param Link $resultLink
     * @param DOMElement $descriptionElement
     * @return Result
     * @throws InvalidResultException
     */
    private function createResult(Link $resultLink, DOMElement $descriptionElement): Result
    {
        $fallbackText = 'A description for this result isn\'t available due to the robots.txt file.';
        $descriptionText = $descriptionElement->nodeValue ?? $fallbackText;

        $result = new Result();
        $result
            ->setTitle($resultLink->getNode()->nodeValue)
            ->setUrl($this->parseUrl($resultLink->getUri()))
            ->setDescription($descriptionText);

        return $result;
    }
106
107
    /**
     * Hands the URL over to the configured proxy, which knows how to parse it.
     *
     * @param string $url URL as found in the result link
     * @return string URL returned by the proxy's parser
     * @throws InvalidResultException
     */
    private function parseUrl(string $url): string
    {
        $parsedUrl = $this->proxy->parseUrl($url);

        return $parsedUrl;
    }
118
119
    /**
     * Builds the search URL from the stored domain, search term and country code.
     *
     * The URL always asks for 100 results ("num=100"); the "gl" parameter is appended only
     * when a country code was provided.
     *
     * @return string
     */
    private function getGoogleUrl(): string
    {
        $baseUrl = "https://{$this->googleDomain}/search?q={$this->searchTerm}&num=100";
        if (empty($this->countryCode)) {
            return $baseUrl;
        }

        return $baseUrl . "&gl={$this->countryCode}";
    }
132 10
133
    /**
     * Tells whether the result contains at least one anchor whose parent is a div.
     *
     * parseDomElement() treats such a result as an image suggestion and rejects it.
     *
     * @param DomCrawler $resultCrawler Crawler wrapping a single result element
     * @return bool
     */
    private function isImageSuggestion(DomCrawler $resultCrawler)
    {
        return $resultCrawler->filterXPath('//div/a')->count() > 0;
    }
141
142
    /**
     * Converts a single result element of the search page into a Result.
     *
     * @param DOMElement $result Element matched by the result XPath in getResults()
     * @return Result
     * @throws InvalidResultException If the link or the description element is missing,
     *                                or if the element is an image suggestion
     */
    private function parseDomElement(DOMElement $result): Result
    {
        $resultCrawler = new DomCrawler($result);

        // getNode() returns null when nothing matches; Link requires a real element, so a
        // missing link is reported as an invalid result instead of failing inside Link.
        $linkElement = $resultCrawler->filterXPath('//h3[@class="r"]/a')->getNode(0);
        if (!$linkElement instanceof DOMElement) {
            throw new InvalidResultException('Link element not found');
        }
        $resultLink = new Link($linkElement, 'http://google.com/');

        $descriptionElement = $resultCrawler->filterXPath('//span[@class="st"]')->getNode(0);
        if (!$descriptionElement instanceof DOMElement) {
            throw new InvalidResultException('Description element not found');
        }

        if ($this->isImageSuggestion($resultCrawler)) {
            throw new InvalidResultException('Result is an image suggestion');
        }

        return $this->createResult($resultLink, $descriptionElement);
    }
160
}
161