Passed
Push — master ( 9abeb8...36c770 )
by Dev
13:09
created

Harvest::getResponse()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 1
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 1
c 1
b 0
f 0
dl 0
loc 3
ccs 1
cts 1
cp 1
rs 10
cc 1
nc 1
nop 0
crap 1
1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
use PiedWeb\Curl\Request as CurlRequest;
6
use PiedWeb\Curl\Response;
7
use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer;
8
use Psr\Http\Message\UriInterface;
9
use Spatie\Robots\RobotsHeaders;
10
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
11
12
class Harvest
13
{
14
    use HarvestLinksTrait;
15
    use RobotsTxtTrait;
16
17
    const DEFAULT_USER_AGENT = 'SeoPocketCrawler - Open Source Bot for SEO Metrics';
18
19
    /**
20
     * @var Response
21
     */
22
    protected $response;
23
24
    /**
25
     * @var \Symfony\Component\DomCrawler\Crawler
26
     */
27
    protected $dom;
28
29
    /** @var string */
30
    protected $baseUrl;
31
32
    /** @var string */
33
    protected $domain;
34
35
    /** @var bool */
36
    protected $follow;
37
38
    /** @var string */
39
    private $domainWithScheme;
0 ignored issues
show
introduced by
The private property $domainWithScheme is not used, and could be removed.
Loading history...
40
41
    /** @var \PiedWeb\TextAnalyzer\Analysis */
42
    private $textAnalysis;
43
44
    /** @var Url */
45
    protected $urlRequested;
46
47
    /** @var Url */
48
    protected $url;
49
50
    /**
     * Request a URL and wrap the resulting response in a Harvest.
     *
     * The URL goes through Link::normalizeUrl() before being handed to
     * Request::makeFromRequest() together with the other arguments.
     *
     * @param string           $url             URL to request
     * @param string           $userAgent       user agent forwarded to the request
     * @param string           $language        language value forwarded to the request
     * @param CurlRequest|null $previousRequest earlier request forwarded to Request::makeFromRequest()
     *
     * @return self|int a Harvest when the request produced a Response; otherwise the value
     *                  returned by Request::makeFromRequest(), passed through unchanged
     *                  (presumably an error code - confirm against Request)
     */
    public static function fromUrl(
        string $url,
        string $userAgent = self::DEFAULT_USER_AGENT,
        string $language = 'en,en-US;q=0.5',
        ?CurlRequest $previousRequest = null
    ) {
        $result = Request::makeFromRequest(
            $previousRequest,
            Link::normalizeUrl($url), // add trailing slash for domain
            $userAgent,
            $language
        );

        if (!$result instanceof Response) {
            return $result;
        }

        return new self($result);
    }
69 18
70
    /**
     * Store the response and derive both URL objects from it.
     *
     * `$url` is built from the response's effective URL, `$urlRequested` from
     * the URL the response reports as requested.
     */
    public function __construct(Response $response)
    {
        $this->response = $response;

        $this->url = new Url($response->getEffectiveUrl());
        $this->urlRequested = new Url($response->getUrl());
    }
77
78
    /**
     * Return the Url built in the constructor from Response::getUrl().
     */
    public function urlRequested(): Url
    {
        return $this->urlRequested;
    }
82 3
83 3
    /**
     * Return the response url (curl effective url).
     *
     * todo: check if urlRequested can be different than url (depends on curl wrench).
     */
    public function url(): Url
    {
        return $this->url;
    }
91
92 21
    /**
     * Return the same Url object as url() (built from the effective url).
     */
    public function getUrl(): Url
    {
        return $this->url;
    }
96
97 21
    /**
     * Return the Response this Harvest was constructed with.
     */
    public function getResponse(): Response
    {
        return $this->response;
    }
101
102 18
    /**
     * Return a DomCrawler over the response content.
     *
     * The crawler is created on first call and reused afterwards.
     *
     * @return DomCrawler
     */
    public function getDom()
    {
        if (null === $this->dom) {
            $this->dom = new DomCrawler($this->response->getContent());
        }

        return $this->dom;
    }
108
109
    /**
     * Filter the document with a selector.
     *
     * @param string   $selector selector passed to DomCrawler::filter()
     * @param int|null $i        when not null, reduce the result to the node at this position
     */
    private function find($selector, $i = null): DomCrawler
    {
        $matches = $this->getDom()->filter($selector);

        if (null === $i) {
            return $matches;
        }

        return $matches->eq($i);
    }
113
114
    /**
     * Shortcut for find($selector, 0): keep only the first match.
     *
     * @param string $selector selector passed to find()
     */
    private function findOne($selector): DomCrawler
    {
        $firstPosition = 0;

        return $this->find($selector, $firstPosition);
    }
121
122 6
    /**
     * Return the cleaned text content of the first element matching a selector.
     * Eg.: getTag('title').
     *
     * @param string $selector selector passed to findOne()
     *
     * @return string|null null when no element matches the selector
     */
    public function getTag($selector)
    {
        $found = $this->findOne($selector);

        // findOne() always returns a crawler, never null; an empty crawler must be
        // detected with count(), because text() on an empty node list throws.
        return $found->count() > 0 ? Helper::clean($found->text()) : null;
    }
134
135
    /**
     * Return the cleaned text of a tag expected to appear exactly once.
     *
     * @param string $selector selector passed to find()
     *
     * @return string|null null when nothing matches; a warning string made of the
     *                     match count and the selector when several elements match;
     *                     otherwise the cleaned text of the single match
     */
    public function getUniqueTag($selector = 'title')
    {
        $found = $this->find($selector);
        $matches = $found->count();

        if (0 === $matches) {
            return null;
        }

        if ($matches > 1) {
            return $matches.' `'.$selector.'` /!\ ';
        }

        return Helper::clean($found->eq(0)->text());
    }
149
150
    /**
     * Return the content attribute of the first `meta[name=...]` element.
     *
     * @param string $name value of the meta `name` attribute to look for
     *
     * @return string|null null when no such meta exists, an empty string when it has
     *                     no content attribute, the cleaned content otherwise
     */
    public function getMeta(string $name): ?string
    {
        $meta = $this->findOne('meta[name='.$name.']');

        if ($meta->count() < 1) {
            return null;
        }

        $content = $meta->attr('content');

        return null === $content ? '' : Helper::clean($content);
    }
161 9
162
    /**
     * Return the href attribute of the `link rel=canonical` tag.
     *
     * @return string|null null when the page has no canonical link, an empty string
     *                     when the link has no href attribute, the href otherwise
     */
    public function getCanonical(): ?string
    {
        $canonical = $this->findOne('link[rel=canonical]');

        if ($canonical->count() < 1) {
            return null;
        }

        return $canonical->attr('href') ?? '';
    }
171 3
172 3
    /**
     * Tell whether the canonical is consistent with the requested url.
     *
     * @return bool true if the canonical equals the requested url or if there is no canonical tag
     */
    public function isCanonicalCorrect(): bool
    {
        $canonical = $this->getCanonical();

        if (null === $canonical) {
            return true;
        }

        return $this->urlRequested()->get() == $canonical;
    }
181
182 3
    /**
     * Return the text analysis of the document text.
     *
     * The analysis is computed on first call and reused afterwards.
     *
     * @return \PiedWeb\TextAnalyzer\Analysis
     */
    public function getTextAnalysis()
    {
        if (null !== $this->textAnalysis) {
            return $this->textAnalysis;
        }

        $this->textAnalysis = TextAnalyzer::get(
            $this->getDom()->text(),
            true,   // only sentences
            1,      // no expression, just words
            0      // keep trail
        );

        return $this->textAnalysis;
    }
195 3
196
    /**
     * Return the expressions of the text analysis, requesting 10 from getExpressions().
     *
     * @return mixed whatever Analysis::getExpressions() returns
     */
    public function getKws()
    {
        $analysis = $this->getTextAnalysis();

        return $analysis->getExpressions(10);
    }
200
201
    /**
     * Return the text/code ratio as a rounded percentage.
     *
     * Both lengths are byte lengths (strlen): the document text versus the
     * cleaned raw response content.
     *
     * @return int 0 when the cleaned content is empty
     */
    public function getRatioTxtCode(): int
    {
        $textLength = strlen($this->getDom()->text());
        $htmlLength = strlen(Helper::clean($this->response->getContent()));

        if ($htmlLength > 0) {
            return (int) round($textLength / $htmlLength * 100);
        }

        return 0;
    }
208 3
209
    /**
     * Return the breadcrumb extracted from the page.
     *
     * Without a separator the value of ExtractBreadcrumb::get() is returned as is.
     * With a separator, and when that value is an array, the clean name of each
     * item is joined into a single string.
     *
     * No native return type is declared: the joined form is a string, which the
     * former `?array` declaration rejected with a TypeError.
     *
     * @param string|null $separator glue used to join the item names; null to get the raw items
     *
     * @return array|string|null
     */
    public function getBreadCrumb(?string $separator = null)
    {
        $breadcrumb = ExtractBreadcrumb::get($this);

        if (null === $separator || !is_array($breadcrumb)) {
            return $breadcrumb;
        }

        $names = array_map(static function ($item) {
            return $item->getCleanName();
        }, $breadcrumb);

        return implode($separator, $names);
    }
225
226
    /**
     * Return the target of the `Location` response header.
     *
     * Header names are matched case-insensitively. The header is only used when
     * ExtractLinks::isWebLink() accepts it, and it is resolved against url().
     *
     * @return ?string absolute url, null when there is no usable location header
     */
    public function getRedirection(): ?string
    {
        $headers = array_change_key_case($this->response->getHeaders() ?: []);

        if (!isset($headers['location']) || !ExtractLinks::isWebLink($headers['location'])) {
            return null;
        }

        return $this->url()->resolve($headers['location']);
    }
239 3
240 3
    /**
     * Return the redirection as a Link created by Link::createRedirection().
     *
     * @return Link|null null when getRedirection() finds no redirection
     */
    public function getRedirectionLink(): ?Link
    {
        $target = $this->getRedirection();

        return null === $target ? null : Link::createRedirection($target, $this);
    }
250
251 6
    /**
     * Tell whether the response redirects to the https version of the requested url.
     *
     * The requested url has its leading `http:` replaced by `https:` and is then
     * compared with the redirection target.
     */
    public function isRedirectToHttps(): bool
    {
        $redirUrl = $this->getRedirection();

        if (null === $redirUrl) {
            return false;
        }

        $httpsVersion = preg_replace('#^http:#', 'https:', $this->urlRequested()->get(), 1);

        return $httpsVersion == $redirUrl;
    }
257
258
    /**
     * Return the value in base tag if exist, else, current Url.
     *
     * The base href is only used when it passes FILTER_VALIDATE_URL. The result is
     * computed on first call and reused afterwards.
     */
    public function getBaseUrl(): string
    {
        if (!isset($this->baseUrl)) {
            $base = $this->findOne('base');

            // Read the attribute through attr(): a crawler does not expose element
            // attributes as properties, so `isset($base->href)` could never be true.
            $href = $base->count() > 0 ? $base->attr('href') : null;

            if (null !== $href && false !== filter_var($href, FILTER_VALIDATE_URL)) {
                $this->baseUrl = $href;
            } else {
                $this->baseUrl = $this->url()->get();
            }
        }

        return $this->baseUrl;
    }
274
275
    /**
     * Delegate the indexability check to Indexable::isIndexable().
     *
     * @param string $userAgent user agent the check is made for
     *
     * @return int correspond to a const from Indexable
     */
    public function isIndexable(string $userAgent = 'googlebot'): int
    {
        $status = Indexable::isIndexable($this, $userAgent);

        return $status;
    }
282
283 12
    /**
     * Tell whether the `googlebot` and `robots` meta tags allow following links.
     *
     * @return bool false as soon as one of the two metas contains `nofollow`
     */
    protected function metaAuthorizeToFollow()
    {
        foreach (['googlebot', 'robots'] as $metaName) {
            $content = $this->getMeta($metaName);

            // Compare against false explicitly: a match at offset 0 (content="nofollow")
            // is falsy and would otherwise be read as "not found". The match is
            // case-insensitive so that `NOFOLLOW` is recognised as well.
            if (null !== $content && false !== stripos($content, 'nofollow')) {
                return false;
            }
        }

        return true;
    }
287
288 3
    /**
     * Tell whether links of this page may be followed.
     *
     * Both the robots response headers and the robots meta tags must allow it.
     * The result is computed on first call and reused afterwards.
     *
     * @return bool
     */
    public function mayFollow()
    {
        if (null === $this->follow) {
            // Fall back to an empty list when the response has no headers, the same
            // way getRedirection() does, so RobotsHeaders always receives an array.
            $headers = $this->response->getHeaders();
            $robotsHeaders = new RobotsHeaders($headers ? $headers : []);

            $this->follow = $robotsHeaders->mayFollow() && $this->metaAuthorizeToFollow();
        }

        return $this->follow;
    }
297
}
298