Passed
Push — master ( ef5fa3...73b965 )
by Dev
11:08
created

Harvest::checkCanonicalException()   A

Complexity

Conditions 5
Paths 2

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 6
CRAP Score 5

Importance

Changes 0
Metric Value
eloc 5
c 0
b 0
f 0
dl 0
loc 9
ccs 6
cts 6
cp 1
rs 9.6111
cc 5
nc 2
nop 2
crap 5
1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
use PiedWeb\Curl\Request as CurlRequest;
6
use PiedWeb\Curl\Response;
7
use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer;
8
use Spatie\Robots\RobotsHeaders;
9
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
10
11
class Harvest
12
{
13
    use HarvestLinksTrait;
    use RobotsTxtTrait;

    public const DEFAULT_USER_AGENT = 'SeoPocketCrawler - Open Source Bot for SEO Metrics';

    /** @var Response response the harvest was built from */
    protected $response;

    /** @var \Symfony\Component\DomCrawler\Crawler|null crawler over the response content, created on first getDom() call */
    protected $dom;

    /** @var string|null cached result of getBaseUrl() */
    protected $baseUrl;

    /** @var bool|null cached result of mayFollow() */
    protected $follow;

    /** @var \PiedWeb\TextAnalyzer\Analysis|null cached result of getTextAnalysis() */
    private $textAnalysis;

    /** @var Url URL built from the response's requested URL */
    protected $urlRequested;

    /** @var Url URL built from the response's effective URL */
    protected $url;
42
43
    /**
     * Request a URL and wrap the resulting response in a new Harvest.
     *
     * @param string           $url             URL to request; normalized with Link::normalizeUrl() first
     * @param string           $userAgent       user agent forwarded to Request::makeFromRequest()
     * @param string           $language        language value forwarded to Request::makeFromRequest()
     * @param CurlRequest|null $previousRequest previous request forwarded to Request::makeFromRequest()
     *
     * @return self|int a Harvest when the request produced a Response; otherwise the value
     *                  returned by Request::makeFromRequest(), passed through unchanged
     */
    public static function fromUrl(
        string $url,
        string $userAgent = self::DEFAULT_USER_AGENT,
        string $language = 'en,en-US;q=0.5',
        ?CurlRequest $previousRequest = null
    ) {
        // Normalizing adds the trailing slash for a bare domain.
        $normalizedUrl = Link::normalizeUrl($url);

        $result = Request::makeFromRequest($previousRequest, $normalizedUrl, $userAgent, $language);

        if (! $result instanceof Response) {
            return $result;
        }

        return new self($result);
    }
62
63 27
    /**
     * Build a harvest from an already fetched response.
     *
     * @param Response $response response to analyse
     */
    public function __construct(Response $response)
    {
        $this->response = $response;
        $this->urlRequested = new Url($this->response->getUrl());

        // Static analysis reports that getEffectiveUrl() may return null while Url expects a
        // string; fall back to the requested URL rather than handing null to Url.
        $this->url = new Url($this->response->getEffectiveUrl() ?? $this->response->getUrl());
    }
70
71 6
    /**
     * Return the URL object built from the response's requested URL.
     */
    public function urlRequested(): Url
    {
        return $this->urlRequested;
    }
75
76
    /**
     * Return the response URL (cURL effective URL).
     *
     * TODO: check whether urlRequested can be different from url (depends on the curl wrapper).
     */
    public function url(): Url
    {
        return $this->url;
    }
84
85 15
    /**
     * Return the same Url object as url().
     */
    public function getUrl(): Url
    {
        return $this->url;
    }
89
90 18
    /**
     * Return the response this harvest was built from.
     */
    public function getResponse(): Response
    {
        return $this->response;
    }
94
95 33
    /**
     * Return the DOM crawler for the response content, creating it on first use.
     *
     * @return DomCrawler
     */
    public function getDom()
    {
        if (null === $this->dom) {
            $this->dom = new DomCrawler($this->response->getContent());
        }

        return $this->dom;
    }
101
102 21
    /**
     * Filter the DOM with a CSS selector.
     *
     * @param string   $selector CSS selector
     * @param int|null $i        when given, keep only the node at this position
     */
    private function find($selector, $i = null): DomCrawler
    {
        $nodes = $this->getDom()->filter($selector);

        if (null === $i) {
            return $nodes;
        }

        return $nodes->eq($i);
    }
106
107
    /**
     * Return the first node matching the selector (same result as find($selector, 0)).
     *
     * @param string $selector CSS selector
     */
    private function findOne($selector): DomCrawler
    {
        return $this->getDom()->filter($selector)->eq(0);
    }
114
115
    /**
     * Return the cleaned text of the first node matching a selector.
     * Eg.: getTag('title').
     *
     * @param string $selector CSS selector
     *
     * @return ?string null when nothing matches
     */
    public function getTag($selector)
    {
        $node = $this->findOne($selector);

        if (0 === $node->count()) {
            return null;
        }

        return Helper::clean($node->text());
    }
127
128 6
    /**
     * Return the cleaned text of a tag expected to appear exactly once.
     *
     * @param string $selector CSS selector
     *
     * @return ?string null when nothing matches; a warning string starting with the number of
     *                 matches when there is more than one; otherwise the cleaned text
     */
    public function getUniqueTag($selector = 'title')
    {
        $matches = $this->find($selector);
        $total = $matches->count();

        if (0 === $total) {
            return null;
        }

        if ($total > 1) {
            return $total.' `'.$selector.'` /!\ ';
        }

        return Helper::clean($matches->eq(0)->text());
    }
142
143
    /**
     * Return the cleaned `content` attribute of the first `meta` tag with the given name.
     *
     * @param string $name value of the meta `name` attribute to look for
     *
     * @return string|null null when no such meta exists, an empty string when it has no
     *                     `content` attribute, otherwise the cleaned content
     */
    public function getMeta(string $name): ?string
    {
        // Quote the attribute value so names that are not plain identifiers (e.g. `twitter:card`)
        // still form a valid selector; backslashes and double quotes inside it are escaped.
        $meta = $this->findOne('meta[name="'.addcslashes($name, '"\\').'"]');

        if (0 === $meta->count()) {
            return null;
        }

        $content = $meta->attr('content');

        return null !== $content ? Helper::clean($content) : '';
    }
155
156
    /**
     * Return the `href` attribute of the `link rel=canonical` tag.
     *
     * @return string|null null when there is no canonical link, an empty string when the
     *                     link has no `href` attribute
     */
    public function getCanonical(): ?string
    {
        $link = $this->findOne('link[rel=canonical]');

        if (0 === $link->count()) {
            return null;
        }

        return $link->attr('href') ?? '';
    }
165
166
    /**
     * Tell whether the canonical link is consistent with the requested URL.
     *
     * @param string|null $urlRequested URL to compare with; defaults to the harvest's requested URL
     *
     * @return bool true when there is no canonical tag, when the canonical equals the requested
     *              URL, or when checkCanonicalException() accepts the pair
     */
    public function isCanonicalCorrect(?string $urlRequested = null): bool
    {
        $canonical = $this->getCanonical();

        if (null === $canonical) {
            return true;
        }

        $urlRequested = $urlRequested ?? $this->urlRequested()->get();

        // Strict comparison: `==` would compare numeric-looking strings by value.
        if ($urlRequested === $canonical) {
            return true;
        }

        return $this->checkCanonicalException($urlRequested, $canonical);
    }
183 3
184 3
    /**
     * Accept a canonical that differs from the requested URL only by a trailing slash.
     *
     * The pattern captures the shortest leading part of the requested URL that ends on a
     * character other than `/` or `:` and is followed by `/`, `?` or the end of the string.
     *
     * @param string $urlRequested requested URL
     * @param string $canonical    canonical URL found in the page
     *
     * @return bool true when the captured part is the whole requested URL and the canonical is
     *              that part, with or without a trailing slash
     */
    private function checkCanonicalException(string $urlRequested, string $canonical): bool
    {
        // preg_match() returns 0 when nothing matches; the former `!== false` test let that case
        // through and then read an undefined $match[0]. Require an actual match.
        if (1 !== preg_match('/^.+?[^\/:](?=[?\/]|$)/', $urlRequested, $match)) {
            return false;
        }

        $root = $match[0];

        // NOTE(review): ltrim() only strips leading slashes, so this holds only when nothing
        // follows the captured part; rtrim() may have been intended — confirm before changing.
        if ($root !== ltrim($urlRequested, '/')) {
            return false;
        }

        return $root === $canonical || $root.'/' === $canonical;
    }
194
195 3
    /**
     * Return the text analysis of the DOM text, computing it when none is cached.
     *
     * @return \PiedWeb\TextAnalyzer\Analysis|null null when the DOM crawler holds no node
     */
    public function getTextAnalysis()
    {
        if (null === $this->textAnalysis) {
            $dom = $this->getDom();

            if ($dom->count() > 0) {
                $onlySentences = true;
                $expressionSize = 1; // no expression, just words
                $keepTrail = 0;

                $this->textAnalysis = TextAnalyzer::get($dom->text(), $onlySentences, $expressionSize, $keepTrail);
            } else {
                $this->textAnalysis = null;
            }
        }

        return $this->textAnalysis;
    }
208
209
    /**
     * Count the words of the DOM text with str_word_count().
     */
    public function getWordCount(): int
    {
        $text = $this->getDom()->text('') ?? '';

        return (int) str_word_count($text);
    }
213 3
214
    /**
     * Return the result of getExpressions(10) on the text analysis.
     *
     * NOTE(review): getTextAnalysis() returns null when the DOM holds no node, in which case
     * this call fails — confirm whether callers guard against it.
     */
    public function getKws()
    {
        $analysis = $this->getTextAnalysis();

        return $analysis->getExpressions(10);
    }
218
219
    /**
     * Return the text/code ratio as a rounded percentage.
     *
     * Both lengths are byte lengths (strlen): the DOM text against the cleaned response content.
     *
     * @return int 0 when the cleaned response content is empty
     */
    public function getRatioTxtCode(): int
    {
        $textLength = strlen($this->getDom()->text(''));
        $htmlLength = strlen(Helper::clean($this->response->getContent()));

        if ($htmlLength <= 0) {
            return 0;
        }

        return (int) round($textLength / $htmlLength * 100);
    }
226
227
    /**
     * Return the breadcrumb extracted by ExtractBreadcrumb::get().
     *
     * @param string|null $separator when given and the breadcrumb is an array, the clean name of
     *                               each item is joined with this separator
     *
     * @return mixed the joined string in that case, otherwise the raw ExtractBreadcrumb::get() result
     */
    public function getBreadCrumb(?string $separator = null)
    {
        $breadcrumb = ExtractBreadcrumb::get($this);

        if (null === $separator || ! is_array($breadcrumb)) {
            return $breadcrumb;
        }

        $names = [];
        foreach ($breadcrumb as $key => $item) {
            $names[$key] = $item->getCleanName();
        }

        return implode($separator, $names);
    }
243
244
    /**
     * Return the redirection target announced by the `Location` response header.
     *
     * @return ?string the header value resolved against the response URL, or null when the
     *                 header is absent or ExtractLinks::isWebLink() rejects it
     */
    public function getRedirection(): ?string
    {
        // Lower-case the header names so the lookup does not depend on their case.
        $headers = array_change_key_case($this->response->getHeaders() ?: []);

        if (! isset($headers['location']) || ! ExtractLinks::isWebLink($headers['location'])) {
            return null;
        }

        return $this->url()->resolve($headers['location']);
    }
257
258
    /**
     * Return the redirection as a Link, or null when getRedirection() finds none.
     */
    public function getRedirectionLink(): ?Link
    {
        $target = $this->getRedirection();

        return null === $target ? null : Link::createRedirection($target, $this);
    }
268
269
    /**
     * Tell whether the response redirects to the requested URL with `http:` replaced by `https:`.
     */
    public function isRedirectToHttps(): bool
    {
        $target = $this->getRedirection();

        if (null === $target) {
            return false;
        }

        $httpsVariant = preg_replace('#^http:#', 'https:', $this->urlRequested()->get(), 1);

        return $httpsVariant == $target;
    }
275
276
    /**
     * Return the base URL used for the page and cache it.
     *
     * The base href reported by the crawler filtered on `base` is used when it is a valid URL;
     * otherwise the current URL is used.
     */
    public function getBaseUrl(): string
    {
        if ($this->baseUrl) {
            return $this->baseUrl;
        }

        $baseHref = $this->findOne('base')->getBaseHref();

        $this->baseUrl = ($baseHref && filter_var($baseHref, FILTER_VALIDATE_URL))
            ? $baseHref
            : $this->url()->get();

        return $this->baseUrl;
    }
292 9
293
    /**
     * Return the indexability status computed by Indexable::indexable() for this harvest.
     *
     * @param string $userAgent user agent passed on to Indexable::indexable()
     *
     * @return int one of the Indexable constants
     */
    public function indexable(string $userAgent = 'googlebot'): int
    {
        $status = Indexable::indexable($this, $userAgent);

        return $status;
    }
300
301
    /**
     * Tell whether indexable() returns Indexable::INDEXABLE for the given user agent.
     */
    public function isIndexable(string $userAgent = 'googlebot'): bool
    {
        return $this->indexable($userAgent) === Indexable::INDEXABLE;
    }
305
306
    /**
     * Tell whether the `googlebot` and `robots` meta tags allow links to be followed.
     *
     * @return bool false as soon as one of the two meta contents contains `nofollow`
     */
    protected function metaAuthorizeToFollow()
    {
        foreach (['googlebot', 'robots'] as $metaName) {
            $content = $this->getMeta($metaName);

            // Compare against false explicitly: a directive found at offset 0
            // (content="nofollow") is a match too. A missing meta (null) is skipped rather than
            // passed to the string function. The search is case-insensitive.
            if (null !== $content && false !== stripos($content, 'nofollow')) {
                return false;
            }
        }

        return true;
    }
310
311
    /**
     * Tell whether links may be followed, caching the answer after the first call.
     *
     * @return bool true when both the response headers (via RobotsHeaders) and the meta tags
     *              allow following
     */
    public function mayFollow()
    {
        if (null !== $this->follow) {
            return $this->follow;
        }

        $robotsHeaders = new RobotsHeaders((array) $this->response->getHeaders());
        $this->follow = $robotsHeaders->mayFollow() && $this->metaAuthorizeToFollow();

        return $this->follow;
    }
320
}
321