Completed
Push — master ( efb003...741aff )
by Vinicius
01:27
created

Crawler::getGoogleUrl()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 10
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 2.0185

Importance

Changes 0
Metric Value
dl 0
loc 10
ccs 5
cts 6
cp 0.8333
rs 9.4285
c 0
b 0
f 0
cc 2
eloc 6
nc 2
nop 0
crap 2.0185
1
<?php
2
namespace CViniciusSDias\GoogleCrawler;
3
4
use CViniciusSDias\GoogleCrawler\Exception\InvalidResultException;
5
use CViniciusSDias\GoogleCrawler\Proxy\{
6
    GoogleProxyInterface, NoProxy
7
};
8
use Psr\Http\Message\ResponseInterface;
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
use Symfony\Component\DomCrawler\Link;
11
use DOMElement;
12
13
/**
14
 * Google Crawler
15
 *
16
 * @package CViniciusSDias\GoogleCrawler
17
 * @author Vinicius Dias
18
 */
19
class Crawler
20
{
21
    /** @var GoogleProxyInterface $proxy */
22
    protected $proxy;
23
    /** @var SearchTermInterface $searchTerm */
24
    private $searchTerm;
25
    /** @var string $googleDomain */
26
    private $googleDomain;
27
    /** @var string $countryCode */
28
    private $countryCode;
29
30 3
    /**
     * Configures the crawler with the term to search for and the way Google is reached
     *
     * @param SearchTermInterface $searchTerm Term to be searched
     * @param GoogleProxyInterface|null $proxy Proxy used for the request; a NoProxy instance is used when null
     * @param string $googleDomain Domain to query; must contain "google." and must not start with "http"
     *                             (both checks are case-insensitive)
     * @param string $countryCode Value later sent as the "gl" query parameter; stored upper-cased
     * @throws \InvalidArgumentException If $googleDomain does not pass the checks described above
     */
    public function __construct(
        SearchTermInterface $searchTerm, GoogleProxyInterface $proxy = null,
        string $googleDomain = 'google.com', string $countryCode = ''
    ) {
        $this->proxy = $proxy ?? new NoProxy();
        $this->searchTerm = $searchTerm;

        // A bare host such as "google.com.br" is expected here, not a full URL.
        $mentionsGoogle = mb_stripos($googleDomain, 'google.') !== false;
        $startsWithScheme = mb_stripos($googleDomain, 'http') === 0;
        if (!$mentionsGoogle || $startsWithScheme) {
            throw new \InvalidArgumentException('Invalid google domain');
        }

        $this->googleDomain = $googleDomain;
        $this->countryCode = mb_strtoupper($countryCode);
    }
44
45
    /**
     * Returns the 100 first found results for the specified search term
     *
     * Results for which parseResult() throws an InvalidResultException are skipped,
     * as are results whose title link node cannot be located.
     *
     * @return ResultList
     * @throws \GuzzleHttp\Exception\ServerException If the proxy was overused
     * @throws \GuzzleHttp\Exception\ConnectException If the proxy is unavailable or $googleDomain is invalid
     */
    public function getResults(): ResultList
    {
        $googleUrl = $this->getGoogleUrl();
        /** @var ResponseInterface $response */
        $response = $this->proxy->getHttpResponse($googleUrl);
        $stringResponse = (string) $response->getBody();
        $domCrawler = new DomCrawler($stringResponse);
        // Only result containers that hold a title heading with a link are considered.
        $googleResults = $domCrawler->filterXPath('//div[@class="g" and h3[@class="r" and a]]');
        $resultList = new ResultList($googleResults->count());

        foreach ($googleResults as $result) {
            $resultCrawler = new DomCrawler($result);
            $linkElement = $resultCrawler->filterXPath('//h3[@class="r"]/a')->getNode(0);
            if (!$linkElement instanceof DOMElement) {
                // getNode(0) yields null when nothing matched, and Link needs a real element.
                continue;
            }
            $resultLink = new Link($linkElement, 'http://google.com/');

            // NOTE(review): getNode(0) yields null when the result has no snippet span;
            // parseResult() has to cope with that case.
            $descriptionElement = $resultCrawler->filterXPath('//span[@class="st"]')->getNode(0);
            try {
                $googleResult = $this->parseResult($resultLink, $descriptionElement);
                $resultList->addResult($googleResult);
            } catch (InvalidResultException $invalidResult) {
                // Maybe log this exception. Other than that, there's nothing to do, cause it isn't an error.
            }
        }

        return $resultList;
    }
77
78
    /**
     * Assembles a Result (title, URL and description) from a result's link and snippet nodes
     *
     * @param Link $resultLink Link taken from the result's title
     * @param DOMElement|null $descriptionElement Snippet node, or null when the result has none
     * @return Result
     * @throws InvalidResultException
     */
    private function parseResult(Link $resultLink, DOMElement $descriptionElement = null): Result
    {
        // `??` also covers a null $descriptionElement, so a result without a snippet
        // gets the fallback text instead of failing the parameter type check.
        $description = $descriptionElement->nodeValue
            ?? 'A description for this result isn\'t available due to the robots.txt file.';

        $googleResult = new Result();
        $googleResult
            ->setTitle($resultLink->getNode()->nodeValue)
            ->setUrl($this->parseUrl($resultLink->getUri()))
            ->setDescription($description);

        return $googleResult;
    }
99
100
    /**
     * Delegates URL parsing to the parser provided by $proxy
     *
     * @param string $url URL to hand over to the proxy
     * @return string Whatever the proxy's parseUrl() returns for $url
     * @throws InvalidResultException
     */
    private function parseUrl(string $url): string
    {
        $parsedUrl = $this->proxy->parseUrl($url);

        return $parsedUrl;
    }
111
112
    /**
     * Builds the search URL from the configured domain, search term and country code
     *
     * The "gl" parameter is appended only when a non-empty country code was given.
     *
     * @return string
     */
    private function getGoogleUrl(): string
    {
        $searchUrl = "https://{$this->googleDomain}/search?q={$this->searchTerm}&num=100";

        if (empty($this->countryCode)) {
            return $searchUrl;
        }

        return $searchUrl . "&gl={$this->countryCode}";
    }
125
}
126