Completed
Push — master ( f523e0...cc6383 )
by Vinicius
04:37 queued 03:24
created

Crawler::__construct()   A

Complexity

Conditions 4
Paths 4

Size

Total Lines 14

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 8
CRAP Score 4

Importance

Changes 0
Metric Value
dl 0
loc 14
ccs 8
cts 8
cp 1
rs 9.7998
c 0
b 0
f 0
cc 4
nc 4
nop 4
crap 4
1
<?php
2
namespace CViniciusSDias\GoogleCrawler;
3
4
use CViniciusSDias\GoogleCrawler\Exception\InvalidResultException;
5
use CViniciusSDias\GoogleCrawler\Proxy\{
6
    GoogleProxyInterface, NoProxy
7
};
8
use Psr\Http\Message\ResponseInterface;
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
use Symfony\Component\DomCrawler\Link;
11
use DOMElement;
12
13
/**
14
 * Google Crawler
15
 *
16
 * @package CViniciusSDias\GoogleCrawler
17
 * @author Vinicius Dias
18
 */
19
class Crawler
{
    /** @var GoogleProxyInterface $proxy Strategy used to perform the HTTP request and to clean result URLs */
    protected $proxy;
    /** @var SearchTermInterface $searchTerm Term that is interpolated into the search URL */
    private $searchTerm;
    /** @var string $googleDomain Host used to build the search URL, e.g. "google.com" */
    private $googleDomain;
    /** @var string $countryCode Upper-cased value sent as the "gl" query parameter; empty to omit it */
    private $countryCode;

    /**
     * @param SearchTermInterface $searchTerm Term to search for
     * @param GoogleProxyInterface|null $proxy Proxy used for the request; a NoProxy instance is used when omitted
     * @param string $googleDomain Google host without scheme; must contain "google." and must not start with "http"
     * @param string $countryCode Optional country code, stored upper-cased
     * @throws \InvalidArgumentException If $googleDomain fails the validation described above
     */
    public function __construct(
        SearchTermInterface $searchTerm, GoogleProxyInterface $proxy = null,
        string $googleDomain = 'google.com', string $countryCode = ''
    ) {
        // The domain is interpolated after "https://", so a value that already carries a scheme is rejected.
        $containsGoogle = mb_stripos($googleDomain, 'google.') !== false;
        $startsWithScheme = mb_stripos($googleDomain, 'http') === 0;
        if (!$containsGoogle || $startsWithScheme) {
            throw new \InvalidArgumentException('Invalid google domain');
        }

        $this->proxy = $proxy ?? new NoProxy();
        $this->searchTerm = $searchTerm;
        $this->googleDomain = $googleDomain;
        $this->countryCode = mb_strtoupper($countryCode);
    }

    /**
     * Returns the 100 first found results for the specified search term
     *
     * Result blocks without a title link, image suggestions and results whose URL is rejected
     * by the proxy are skipped instead of aborting the whole search.
     *
     * @return ResultList
     * @throws \GuzzleHttp\Exception\ServerException If the proxy was overused
     * @throws \GuzzleHttp\Exception\ConnectException If the proxy is unavailable or the Google domain is invalid
     */
    public function getResults(): ResultList
    {
        $googleUrl = $this->getGoogleUrl();
        /** @var ResponseInterface $response */
        $response = $this->proxy->getHttpResponse($googleUrl);
        $stringResponse = (string) $response->getBody();
        $domCrawler = new DomCrawler($stringResponse);
        $googleResults = $domCrawler->filterXPath('//div[@class="g" and h3[@class="r" and a]]');
        $resultList = new ResultList($googleResults->count());

        foreach ($googleResults as $result) {
            $resultCrawler = new DomCrawler($result);
            $linkElement = $resultCrawler->filterXPath('//h3[@class="r"]/a')->getNode(0);

            // getNode() yields null when nothing matches, and Link only accepts a DOMElement.
            if (!$linkElement instanceof DOMElement || $this->isImageSuggestion($resultCrawler)) {
                continue;
            }

            $resultLink = new Link($linkElement, 'http://google.com/');
            // May be null: not every result carries a description snippet.
            $descriptionElement = $resultCrawler->filterXPath('//span[@class="st"]')->getNode(0);

            try {
                $resultList->addResult($this->parseResult($resultLink, $descriptionElement));
            } catch (InvalidResultException $invalidResult) {
                // Not an error: the entry simply isn't a usable result. Maybe log this exception.
            }
        }

        return $resultList;
    }

    /**
     * Assembles a Result from the title link and the (optional) description element
     *
     * @param Link $resultLink Link taken from the result title
     * @param DOMElement|null $descriptionElement Description node, or null when the result has none
     * @return Result
     * @throws InvalidResultException If the URL of $resultLink is rejected by the proxy's parser
     */
    private function parseResult(Link $resultLink, DOMElement $descriptionElement = null): Result
    {
        // "??" covers both a missing element and a null nodeValue without raising a notice.
        $description = $descriptionElement->nodeValue
            ?? 'A description for this result isn\'t available due to the robots.txt file.';

        $googleResult = new Result();
        $googleResult
            ->setTitle($resultLink->getNode()->nodeValue)
            ->setUrl($this->parseUrl($resultLink->getUri()))
            ->setDescription($description);

        return $googleResult;
    }

    /**
     * Parses the URL using the parser provided by $proxy
     *
     * @param string $url
     * @return string
     * @throws InvalidResultException
     */
    private function parseUrl(string $url): string
    {
        return $this->proxy->parseUrl($url);
    }

    /**
     * Assembles the Google URL using the previously informed data
     *
     * @return string Search URL asking for 100 results, with "gl" appended when a country code was given
     */
    private function getGoogleUrl(): string
    {
        $domain = $this->googleDomain;
        $url = "https://$domain/search?q={$this->searchTerm}&num=100";
        if (!empty($this->countryCode)) {
            $url .= "&gl={$this->countryCode}";
        }

        return $url;
    }

    /**
     * Tells whether the result block is an image suggestion rather than a regular result
     *
     * A block is treated as an image suggestion when it contains at least one
     * anchor that is a direct child of a div.
     *
     * @param DomCrawler $resultCrawler Crawler scoped to a single result block
     * @return bool
     */
    private function isImageSuggestion(DomCrawler $resultCrawler): bool
    {
        return $resultCrawler->filterXPath('//div/a')->count() > 0;
    }
}
139