Issues (8)

src/Harvest.php (1 issue)

Labels
Severity
1
<?php
2
3
namespace PiedWeb\UrlHarvester;
4
5
use PiedWeb\Curl\Request as CurlRequest;
6
use PiedWeb\Curl\Response;
7
use PiedWeb\TextAnalyzer\Analysis;
8
use PiedWeb\TextAnalyzer\Analyzer as TextAnalyzer;
9
use Spatie\Robots\RobotsHeaders;
10
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
11
12
class Harvest
13
{
14
    use HarvestLinksTrait;
15
    use RobotsTxtTrait;
16
17
    /** User agent used by fromUrl() when the caller does not provide one. */
    public const DEFAULT_USER_AGENT = 'SeoPocketCrawler - Open Source Bot for SEO Metrics';

    /** Response wrapped by this harvest; set by the constructor. */
    protected Response $response;

    /** Crawler over the response content; left uninitialized until getDom() is called. */
    protected DomCrawler $dom;

    /** Memoized value of getBaseUrl(); left uninitialized until then. */
    protected string $baseUrl;

    /** Memoized value of mayFollow(); left uninitialized until then. */
    protected bool $follow;

    /** Memoized value of getTextAnalysis(); left uninitialized until then. */
    private Analysis $textAnalysis;

    /** Url built by the constructor from Response::getUrl(). */
    protected Url $urlRequested;

    /** Url built by the constructor from Response::getEffectiveUrl(). */
    protected Url $url;
32
33
    /**
     * Request an URL and wrap the response in a Harvest.
     *
     * The URL is normalized first, then the request is delegated to
     * Request::makeFromRequest(). When that call does not produce a Response,
     * its return value is handed back untouched (documented here as an int;
     * presumably an error code - confirm against Request::makeFromRequest()).
     *
     * @return self|int
     */
    public static function fromUrl(
        string $url,
        string $userAgent = self::DEFAULT_USER_AGENT,
        string $language = 'en,en-US;q=0.5',
        ?CurlRequest $previousRequest = null
    ) {
        // add trailing slash for domain
        $normalizedUrl = Link::normalizeUrl($url);

        $result = Request::makeFromRequest($previousRequest, $normalizedUrl, $userAgent, $language);

        return $result instanceof Response ? new self($result) : $result;
    }
52 18
53
    /**
     * Wrap an already fetched response.
     *
     * Response::getEffectiveUrl() may return null (reported by static
     * analysis), whereas the Url constructor only accepts a string. When no
     * effective URL is available, the requested URL is used instead so that
     * building the harvest does not fail on a type error.
     */
    public function __construct(Response $response)
    {
        $this->response = $response;

        $requestedUrl = $this->response->getUrl();

        $this->url = new Url($this->response->getEffectiveUrl() ?? $requestedUrl);
        $this->urlRequested = new Url($requestedUrl);
    }
60
61
    /**
     * Url that was requested (built from Response::getUrl() in the constructor).
     */
    public function urlRequested(): Url
    {
        return $this->urlRequested;
    }
65 27
66
    /**
     * Url of the response (curl effective url).
     *
     * TODO: check whether urlRequested can differ from url (this depends on
     * how the curl wrapper behaves).
     */
    public function url(): Url
    {
        return $this->url;
    }
74
75
    /**
     * Same value as url(): the Url built from the response effective url.
     */
    public function getUrl(): Url
    {
        return $this->url;
    }
79
80 21
    /**
     * Response given to the constructor.
     */
    public function getResponse(): Response
    {
        return $this->response;
    }
84
85 15
    /**
     * Crawler over the response content, created on first call and reused afterwards.
     *
     * @psalm-suppress RedundantPropertyInitializationCheck
     */
    public function getDom()
    {
        if (! isset($this->dom)) {
            $this->dom = new DomCrawler($this->response->getContent());
        }

        return $this->dom;
    }
92 18
93
    /**
     * Filter the DOM with a selector.
     *
     * When $i is given, only the node at that position is kept; otherwise
     * every matching node is returned.
     */
    private function find($selector, $i = null): DomCrawler
    {
        if (null === $i) {
            return $this->getDom()->filter($selector);
        }

        return $this->getDom()->filter($selector)->eq($i);
    }
97 33
98
    /**
     * First node matching the selector (same result as find($selector, 0)).
     */
    private function findOne($selector): DomCrawler
    {
        return $this->getDom()->filter($selector)->eq(0);
    }
105
106
    /**
     * Cleaned text of the first node matching a selector.
     * Eg.: getTag('title').
     *
     * @return ?string null when nothing matches the selector
     */
    public function getTag($selector)
    {
        $node = $this->findOne($selector);

        if ($node->count() <= 0) {
            return null;
        }

        return Helper::clean($node->text());
    }
118
119
    /**
     * Cleaned text of a tag expected to appear only once in the document.
     *
     * Returns null when the selector matches nothing. When it matches more
     * than one node, a warning string holding the number of matches and the
     * selector is returned instead of the text.
     */
    public function getUniqueTag($selector = 'title')
    {
        $nodes = $this->find($selector);
        $total = $nodes->count();

        if (0 === $total) {
            return null;
        }

        if ($total > 1) {
            return $total.' `'.$selector.'` /!\ ';
        }

        return Helper::clean($nodes->eq(0)->text());
    }
133 3
134
    /**
     * Cleaned `content` attribute of the first `meta` tag with the given name.
     *
     * @return string|null null when the tag is absent, an empty string when
     *                     the tag exists without a content attribute
     */
    public function getMeta(string $name): ?string
    {
        $meta = $this->findOne('meta[name='.$name.']');

        if (0 === $meta->count()) {
            return null;
        }

        $content = $meta->attr('content');

        return null === $content ? '' : Helper::clean($content);
    }
146
147
    /**
     * Value of the href attribute of the `link rel=canonical` tag.
     *
     * @return ?string null when the tag is absent, an empty string when the
     *                 tag exists without a href attribute
     */
    public function getCanonical(): ?string
    {
        $link = $this->findOne('link[rel=canonical]');

        if ($link->count() < 1) {
            return null;
        }

        return $link->attr('href') ?? '';
    }
156
157
    /**
     * Compare the canonical of the page with an URL.
     *
     * @param ?string $urlRequested URL to compare with; defaults to the requested url
     *
     * @return bool true when there is no canonical tag, when the canonical equals
     *              the URL, or when checkCanonicalException() accepts the pair
     */
    public function isCanonicalCorrect(?string $urlRequested = null): bool
    {
        $canonical = $this->getCanonical();

        if (null === $canonical) {
            return true;
        }

        if (null === $urlRequested) {
            $urlRequested = $this->urlRequested()->get();
        }

        return $urlRequested == $canonical
            || $this->checkCanonicalException($urlRequested, $canonical);
    }
176 6
177
    /**
     * Accept a canonical that differs from the requested URL only by a trailing slash
     * when the requested URL has nothing after its leading part.
     *
     * The pattern captures the start of the URL up to the first `/` or `?` that
     * follows a character other than `/` or `:` (presumably scheme and host - verify
     * against the URLs actually passed in). The exception applies when that capture
     * is the whole requested URL and the canonical is the capture, with or without
     * a trailing slash.
     */
    private function checkCanonicalException(string $urlRequested, string $canonical): bool
    {
        // preg_match() returns 0 when nothing matches and false on error; in both
        // cases $match has no index 0, so only a successful match (1) may go further.
        if (1 !== preg_match('/^.+?[^\/:](?=[?\/]|$)/', $urlRequested, $match)) {
            return false;
        }

        $root = $match[0];

        if ($root !== ltrim($urlRequested, '/')) {
            return false;
        }

        return $root === $canonical || $root.'/' === $canonical;
    }
187 3
188
    /**
     * Text analysis of the document text, computed on first call and then reused.
     *
     * @return Analysis|null null when the DOM crawler holds no node
     *
     * @psalm-suppress RedundantPropertyInitializationCheck
     */
    public function getTextAnalysis()
    {
        if (isset($this->textAnalysis)) {
            return $this->textAnalysis;
        }

        $dom = $this->getDom();

        // $textAnalysis is declared as a non-nullable Analysis: assigning null to it
        // raises a TypeError, so the "nothing to analyse" case is returned without
        // being stored.
        if (0 === $dom->count()) {
            return null;
        }

        return $this->textAnalysis = TextAnalyzer::get(
            $dom->text(),
            true,   // only sentences
            1,      // no expression, just words
            0      // keep trail
        );
    }
202 3
203 3
    /**
     * Number of words found by str_word_count() in the text of the DOM
     * (an empty string is used when the crawler has no node).
     */
    public function getWordCount(): int
    {
        $text = $this->getDom()->text('') ?? '';

        return (int) str_word_count($text);
    }
207
208
    /**
     * Expressions extracted by the text analysis, asking it for 10 of them.
     *
     * @return array empty when no text analysis is available; otherwise whatever
     *               Analysis::getExpressions(10) returns (assumed to be an array -
     *               TODO confirm against the TextAnalyzer package)
     */
    public function getKws()
    {
        $analysis = $this->getTextAnalysis();

        // getTextAnalysis() has a null branch for an empty DOM: do not call a method on it.
        if (null === $analysis) {
            return [];
        }

        return $analysis->getExpressions(10);
    }
212
213 3
    /**
     * Rounded percentage of text bytes compared to the bytes of the cleaned
     * response content.
     *
     * @return int 0 when the cleaned content is empty
     */
    public function getRatioTxtCode(): int
    {
        $textLength = \strlen($this->getDom()->text(''));
        $htmlLength = \strlen(Helper::clean($this->response->getContent()));

        if ($htmlLength <= 0) {
            return 0;
        }

        return (int) round($textLength / $htmlLength * 100);
    }
220
221
    /**
     * Breadcrumb extracted from the page by ExtractBreadcrumb::get().
     *
     * Without a separator, the extractor result is returned as is (described
     * upstream as an array of objects holding a link and an anchor). With a
     * separator and an array result, the clean name of every item is joined
     * into a single string.
     */
    public function getBreadCrumb(?string $separator = null)
    {
        $breadcrumb = ExtractBreadcrumb::get($this);

        if (null === $separator || ! \is_array($breadcrumb)) {
            return $breadcrumb;
        }

        $names = [];
        foreach ($breadcrumb as $key => $item) {
            $names[$key] = $item->getCleanName();
        }

        return implode($separator, $names);
    }
237
238
    /**
     * Target of the `Location` response header, resolved against the response url.
     *
     * @return ?string absolute url, or null when there is no location header
     *                 or when ExtractLinks::isWebLink() rejects its value
     */
    public function getRedirection(): ?string
    {
        // Header names are lower-cased so the lookup does not depend on their case.
        $headers = array_change_key_case($this->response->getHeaders() ?: []);
        $location = $headers['location'] ?? null;

        if (null === $location || ! ExtractLinks::isWebLink($location)) {
            return null;
        }

        return $this->url()->resolve($location);
    }
251
252
    /**
     * Redirection target wrapped in a Link, or null when the response does not redirect.
     */
    public function getRedirectionLink(): ?Link
    {
        $target = $this->getRedirection();

        return null === $target ? null : Link::createRedirection($target, $this);
    }
262 3
263 3
    /**
     * Tell whether the response redirects to the requested url with its
     * leading `http:` replaced by `https:`.
     */
    public function isRedirectToHttps(): bool
    {
        $redirUrl = $this->getRedirection();

        if (null === $redirUrl) {
            return false;
        }

        $httpsVersion = preg_replace('#^http:#', 'https:', $this->urlRequested()->get(), 1);

        return $httpsVersion == $redirUrl;
    }
269
270
    /**
     * Base url of the document: the base href reported by the crawler when it
     * is a valid url, the current url otherwise. The value is computed once.
     *
     * @psalm-suppress RedundantPropertyInitializationCheck
     */
    public function getBaseUrl(): string
    {
        if (isset($this->baseUrl)) {
            return (string) $this->baseUrl;
        }

        $href = $this->findOne('base')->getBaseHref();
        $hrefIsUsable = $href && filter_var($href, \FILTER_VALIDATE_URL);

        $this->baseUrl = $hrefIsUsable ? $href : $this->url()->get();

        return (string) $this->baseUrl;
    }
288
289 6
    /**
     * Indexability status of this harvest for a user agent, as computed by
     * Indexable::indexable().
     *
     * @return int one of the constants declared on Indexable
     */
    public function indexable(string $userAgent = 'googlebot'): int
    {
        return Indexable::indexable($this, $userAgent);
    }
296 6
297
    /**
     * True when indexable() reports Indexable::INDEXABLE for the user agent.
     */
    public function isIndexable(string $userAgent = 'googlebot'): bool
    {
        $status = $this->indexable($userAgent);

        return Indexable::INDEXABLE === $status;
    }
301
302
    /**
     * Tell whether the robots meta tags allow the links of the page to be followed.
     *
     * @return bool false as soon as the `googlebot` or the `robots` meta content
     *              contains `nofollow` (matched case-insensitively), true otherwise
     */
    protected function metaAuthorizeToFollow()
    {
        foreach (['googlebot', 'robots'] as $name) {
            // getMeta() returns null when the tag is missing, hence the cast.
            // The result is compared with false because a directive found at
            // offset 0 (content="nofollow") is a match too.
            if (false !== stripos((string) $this->getMeta($name), 'nofollow')) {
                return false;
            }
        }

        return true;
    }
306
307
    /**
     * Tell whether links may be followed: both the robots response headers and
     * the robots meta tags must allow it. The answer is computed once.
     *
     * @psalm-suppress RedundantPropertyInitializationCheck
     */
    public function mayFollow()
    {
        if (isset($this->follow)) {
            return $this->follow;
        }

        $robotsHeaders = new RobotsHeaders((array) $this->response->getHeaders());
        $this->follow = $robotsHeaders->mayFollow() && $this->metaAuthorizeToFollow();

        return $this->follow;
    }
317
}
318